diff --git a/f23/Gurmail_Lecture_Notes/39_Plotting-4/lec_39_plotting4.ipynb b/f23/Gurmail_Lecture_Notes/39_Plotting-4/lec_39_plotting4.ipynb index 4a588103a43acc984fc7c87a76efedb227b9d1c5..dd88e77af527ce384647ec1776d5a860297c7aeb 100644 --- a/f23/Gurmail_Lecture_Notes/39_Plotting-4/lec_39_plotting4.ipynb +++ b/f23/Gurmail_Lecture_Notes/39_Plotting-4/lec_39_plotting4.ipynb @@ -1,5 +1,20 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Announcements - Monday, December 11\n", + "* Download ALL files for today's lecture\n", + "* <b>If you have any problem with P8-P11 grades, please send me (Gurmail.Singh@wisc.edu) an email by December 11.</b>\n", + "* Late days may not be used on P13\n", + "* If you have questions, it is almost always faster to \n", + " * Post on Piazza\n", + " * Go to [office hours](https://sites.google.com/wisc.edu/cs220-oh-f23/home?pli=1) \n", + "### Conflict Form\n", + " * [Final - December 19, 7:45 am](https://cs220.cs.wisc.edu/f23/surveys.html)" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -2485,7 +2500,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/f23/Gurmail_Lecture_Notes/39_Plotting-4/lec_39_plotting4_template.ipynb b/f23/Gurmail_Lecture_Notes/39_Plotting-4/lec_39_plotting4_Gurmail_lec1.ipynb similarity index 96% rename from f23/Gurmail_Lecture_Notes/39_Plotting-4/lec_39_plotting4_template.ipynb rename to f23/Gurmail_Lecture_Notes/39_Plotting-4/lec_39_plotting4_Gurmail_lec1.ipynb index 504d81cb5ae2cd851d10e4d105784f8b63500a9a..3a1f2531be6e120176fe1d90fda91b44d8bbb643 100644 --- a/f23/Gurmail_Lecture_Notes/39_Plotting-4/lec_39_plotting4_template.ipynb +++ b/f23/Gurmail_Lecture_Notes/39_Plotting-4/lec_39_plotting4_Gurmail_lec1.ipynb @@ -1,5 +1,20 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Announcements - Monday, December 11\n", + "* Download ALL files for today's lecture\n", + "* <b>If you have any problem with P8-P11 grades, please send me (Gurmail.Singh@wisc.edu) an email by December 11.</b>\n", + "* Late days may not be used on P13\n", + "* If you have questions, it is almost always faster to \n", + " * Post on Piazza\n", + " * Go to [office hours](https://sites.google.com/wisc.edu/cs220-oh-f23/home?pli=1) \n", + "### Conflict Form\n", + " * [Final - December 19, 7:45 am](https://cs220.cs.wisc.edu/f23/surveys.html)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -822,7 +837,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/f23/Gurmail_Lecture_Notes/39_Plotting-4/lec_39_plotting4_Gurmail_lec2.ipynb b/f23/Gurmail_Lecture_Notes/39_Plotting-4/lec_39_plotting4_Gurmail_lec2.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..3a1f2531be6e120176fe1d90fda91b44d8bbb643 --- /dev/null +++ b/f23/Gurmail_Lecture_Notes/39_Plotting-4/lec_39_plotting4_Gurmail_lec2.ipynb @@ -0,0 +1,845 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Announcements - Monday, December 11\n", + "* Download ALL files for today's lecture\n", + "* <b>If you have any problem with P8-P11 grades, please send me (Gurmail.Singh@wisc.edu) an email by December 11.</b>\n", + "* Late days may not be used on P13\n", + "* If you have questions, it is almost always faster to \n", + " * Post on Piazza\n", + " * Go to [office hours](https://sites.google.com/wisc.edu/cs220-oh-f23/home?pli=1) \n", + "### Conflict Form\n", + " * [Final - December 19, 7:45 am](https://cs220.cs.wisc.edu/f23/surveys.html)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import statements\n", + "import sqlite3\n", + "import os\n", + "\n", + "import pandas as pd\n", + "from pandas import DataFrame, Series\n", + "\n", + "import matplotlib\n", + "from matplotlib import pyplot as plt\n", + "matplotlib.rcParams[\"font.size\"] = 16\n", + "\n", + "import math\n", + "\n", + "import requests" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bar plots using DataFrames" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Bar Plot Example w/ Fire Hydrants\n", + "\n", + "- General review of pandas\n", + "- Some new bar plot options" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: read \"Fire_Hydrants.csv\" into a DataFrame\n", + "hdf = ???\n", + "hdf.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract just the column names\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's create a *bar plot* to visualize *colors* of fire hydrants." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Make a series called counts_series which stores the value counts of the \"nozzle_color\"\n", + "color_counts = ???\n", + "color_counts # what is wrong with this data?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Clean the data ......use str.upper()\n", + "\n", + "color_counts = ???\n", + "color_counts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Make a horizontal bar plot of counts of colors and have the colors match\n", + "# use color list: [\"b\", \"g\", \"darkorange\", \"r\", \"c\", \"0.5\"]\n", + "ax = ???(color = [\"b\", \"g\", \"darkorange\", \"r\", \"c\", \"0.5\"])\n", + "ax.set_xlabel(\"Fire hydrant count\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's create a *bar plot* to visualize *style* of fire hydrants." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Do the same thing as we did for the colors but this time for the \"Style\"\n", + "style_counts = ???\n", + "style_counts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "style_counts.plot.bar()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Grab the top 12 \n", + "top12 = ???\n", + "\n", + "# and them add an index to our Series for the sum of all the \"other\" for \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the results\n", + "ax = top12.plot.bar(color = \"firebrick\")\n", + "ax.set_ylabel(\"Hydrant Count\")\n", + "ax.set_xlabel(\"Hydrant Type\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### In what *decade* were *pacers manufactured*?\n", + "### Take a peek at the *Style* column data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hdf[\"Style\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Which *column* gives *year* information?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hdf.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How to get the *year_manufactured* for *pacers* and *others*?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's get the year manufactured for all of the \"Pacer\" hydrants.\n", + "pacer_years = ???\n", + "\n", + "# Note: We can do this either way\n", + "# pacer_years = hdf[\"year_manufactured\"][hdf[\"Style\"] == \"Pacer\"]\n", + "\n", + "pacer_years" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# then do the same for all the other data\n", + "other_years = ???\n", + "other_years" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How to get the *decade* for *pacers*?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Round each year down to the start of the decade.\n", + "# e.g. 1987 --> 1980, 2003 --> 2000\n", + "pacer_decades = ???\n", + "pacer_decades" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How to convert the *decades* back to *int*?\n", + "- `astype(...)` method\n", + "- `dropna(...)` method" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop the NaN values, convert to int, and do value counts\n", + "pacer_decades = ???\n", + "pacer_decades" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How to *count the decades* for pacers?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pacer_decades_count = ???\n", + "pacer_decades_count" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Count the *decades* for others." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Do the same thing for other_years. Save to a variable called \"other_decades\"\n", + "other_decades = (other_years // 10 * 10).dropna().astype(int)\n", + "other_decades_count = other_decades.value_counts()\n", + "other_decades_count" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Build a DataFrame from a dictionary of key, Series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_df = DataFrame({\n", + " \"pacer\": pacer_decades_count,\n", + " \"other\": other_decades_count,\n", + "})\n", + "plot_df # observe the NaN values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# make a bar plot\n", + "\n", + "ax = ???\n", + "ax.set_xlabel(\"Decade\")\n", + "ax.set_ylabel(\"Hydrant Count\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Ignore data from before 1950 using boolean indexing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ax = ???.plot.bar()\n", + "ax.set_xlabel(\"Decade\")\n", + "ax.set_ylabel(\"Hydrant Count\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Stacked Bar Chart\n", + "`stacked` parameter accepts boolean value as argument" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ax = plot_df[plot_df.index >= 1950].plot.bar(???)\n", + "ax.set_xlabel(\"Decade\")\n", + "ax.set_ylabel(\"Hydrant Count\")\n", + "None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plotting 4\n", + "\n", + "## Learning objectives\n", + "- how to use logarithmic axes\n", + "- how to create multiple plots within same figure" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Logarithmic scale\n", + "- math.log(y, base)\n", + "- find an x, such that 10**x == y\n", + " - math.log10(y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(math.log10(1000))\n", + "print(math.log10(1000000))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(math.log(32, 2))\n", + "print(math.log(256, 4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def log_approx(y):\n", + " assert type(y) == int\n", + " assert y >= 1\n", + " return len(str(y))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(log_approx(123456789)) # What will this output?\n", + "print(math.log10(123456789))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(log_approx(989898))\n", + "print(math.log10(989898))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "errors = []\n", + "for y in range(1, 1000001):\n", + " err = abs(log_approx(y) - math.log10(y))\n", + " errors.append(err)\n", + "max(errors)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Why does this matter?\n", + "- Comparing two numbers:\n", + " - 134234255623423423423432423432432432\n", + " - 2342343252523\n", + "\n", + "- Eventually I don't care what the number is, but only counting the number of digits in the number to know how big the number is!\n", + "- log base 2: counting how many bits we need\n", + "- log base 10: 10 digits 0 through 9!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s = Series([1, 10, 100, 1000, 10000, 100000, 1000000])\n", + "s.plot.line()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s.plot.line(???)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Population example\n", + "https://ourworldindata.org/grapher/population" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "populations = pd.Series({\n", + " \"China\":1439323776,\n", + " \"India\": 1380004385,\n", + " \"Mexico\": 128932753,\n", + " \"Senegal\":16743927,\n", + " \"Bahrain\":1701575,\n", + " \"Grenada\":112523,\n", + " \"Tuvalu\": 11792\n", + "})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot populations as a bar chart." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# not that readable\n", + "???" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now plot on a logarithmic scale." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "???" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Multiple *axessubplots* in the same plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.subplots(ncols = 2, nrows = 2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s1 = Series([1, 2, 3, 3, 4])\n", + "s2 = Series([5, 7, 7, 8])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create a single plot with two sub figures (line plots) and plot s1 on the left and s2 on the right." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(ncols = 2)\n", + "# axes[0] # the area on the left\n", + "# axes[1] # the area on the right\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What is wrong with the below plot?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Y-axes are misleading" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How can we fix that?\n", + "- pass argument to `sharey` parameter while invoking subplots function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(ncols = 2, ??)\n", + "# axes[0] # the area on the left\n", + "# axes[1] # the area on the right\n", + "pd.Series([1, 2, 3, 3, 4]).plot.line(ax = axes[0])\n", + "pd.Series([5, 7, 7, 8]).plot.line(ax = axes[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Iris dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Gather the data.\n", + "resp = requests.get(\"https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data\")\n", + "resp.raise_for_status()\n", + "\n", + "iris_f = open(\"iris.csv\", \"w\")\n", + "iris_f.write(resp.text)\n", + "iris_f.close()\n", + "\n", + "iris_df = pd.read_csv(\"iris.csv\",\n", + " names = [\"sep-len\", \"sep-wid\", \"pet-len\", \"pet-wid\", \"class\"])\n", + "iris_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the sepal length vs the sepal width for each of the classes of flowers.\n", + "colors = [\"r\", \"g\", \"b\"]\n", + "markers = [\".\", \"^\", \"v\"]\n", + "\n", + "varieties = list(set(iris_df[\"class\"]))\n", + "\n", + "fig, axes = plt.subplots(ncols = 3, sharex = True, sharey = True, figsize=(12,3))\n", + "for i in range(len(varieties)):\n", + " variety = varieties[i]\n", + " specific_iris_data = iris_df[iris_df[\"class\"] == variety]\n", + " specific_iris_data.plot.scatter(x = \"sep-len\", y = 'sep-wid', \\\n", + " ax = axes[i], color = colors[i], marker = markers[i],\n", + " label = variety)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Self-practice - Student Information Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: establish connection to \"cs220_survey_data.db\"\n", + "path = \"cs220_survey_data.db\"\n", + "assert os.path.exists(path)\n", + "conn = sqlite3.connect(path)\n", + "\n", + "# TODO: determine the table name and column types\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: display all columns of the table\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Using pandas to read \"cs220_survey_data.csv\" into a DataFrame called survey\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a bar plot of ages.\n", + "- x-axis: each unique age\n", + "- y-axis: count of students with those ages.\n", + "\n", + "Things to consider:\n", + "- Do we really want the ages to be a float value? Make the int conversion.\n", + " - Remember the survey dataset has a few rows where \"Age\" column has no value - so handle the NA values before the conversion\n", + "- Think carefully about how to sort the data before you create the plot." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write equivalent SQL query to retrive the age column values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "# If you repeat the bar plot with SQL version of the data, you will see \n", + "# what are called as \"MultiIndex\" - that is beyond the scope of this course" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a bar plot of Majors.\n", + "- x-axis: each major\n", + "- y-axis: count of students with those majors.\n", + "- represent top 10 major bars and then aggregate the remaining into \"other\" bar." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write equivalent SQL query to retrive the Major column values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "# If you repeat the bar plot with SQL version of the data, you will see \n", + "# what are called as \"MultiIndex\" - that is beyond the scope of this course" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}