Updated lec 38 files

15eaab5c · msyamkumar · d92a7173 · 15eaab5c · 15eaab5c
Commit 15eaab5c authored 2 years ago by msyamkumar
--- a/f22/meena_lec_notes/lec-38/lec_38_plotting3_line_plots.ipynb
+++ b/f22/meena_lec_notes/lec-38/lec_38_plotting3_line_plots.ipynb
--- a/f22/meena_lec_notes/lec-38/lec_38_plotting3_line_plots_template.ipynb
+++ b/f22/meena_lec_notes/lec-38/lec_38_plotting3_line_plots_template.ipynb
 {
 "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ignore this cell (it's just to make certain text red later, but you don't need to understand it).\n",
-    "from IPython.core.display import display, HTML\n",
-    "display(HTML('<style>em { color: red; }</style> <style>.container { width:100% !important; }</style>'))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%matplotlib inline"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -50,12 +30,12 @@
   "outputs": [],
   "source": [
    "def f_to_c(f):\n",
-    "    \n",
+    "    return (5/9) * (f-32)\n",
    "\n",
    "# test it by making several calls\n",
-    "print(f_to_c())\n",
+    "print(f_to_c(212))\n",
-    "print(f_to_c())\n",
+    "print(f_to_c(32))\n",
-    "print(f_to_c())"
+    "print(f_to_c(67))"
   ]
  },
  {
@@ -72,8 +52,8 @@
   "outputs": [],
   "source": [
    "# Establish a connection to \"iris-flowers.db\" database\n",
-    "iris_conn = ???\n",
+    "iris_conn = sqlite3.connect(\"iris-flowers.db\")\n",
-    "???"
+    "pd.read_sql(\"SELECT * FROM sqlite_master WHERE type='table'\", iris_conn)"
   ]
  },
  {
@@ -89,7 +69,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "iris_df = ???\n",
+    "iris_df = pd.read_sql(\"SELECT * FROM iris\", iris_conn)\n",
    "iris_df"
   ]
  },
@@ -97,7 +77,39 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "#### Warmup 3: Scatter plot to visualize relationship between `pet-width` and `pet-length`"
+    "#### Warmup 3a: What are all the class types?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# v1: pandas\n",
+    "varietes = iris_df[\"class\"]\n",
+    "varietes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# v2: SQL\n",
+    "varietes = list(pd.read_sql(\"\"\"\n",
+    "    SELECT DISTINCT class\n",
+    "    FROM iris\n",
+    "\"\"\", iris_conn)[\"class\"])\n",
+    "varietes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Warmup 3b: Scatter plot to visualize relationship between `pet-width` and `pet-length`"
   ]
  },
  {
@@ -114,18 +126,21 @@
    "# getting unique class column values\n",
    "varietes = list(set(iris_df[\"class\"]))\n",
    "\n",
-    "plot_area = None\n",
+    "# Iterate over indices of varieties list\n",
+    "# Q: Why are we iterating over indices instead values here?\n",
+    "#    Discuss how it will be useful to extract information from other lists \n",
+    "#    like colors and markers\n",
    "for i in range(len(varietes)):\n",
    "    variety = varietes[i]\n",
+    "    curr_color = ??? # write code to extract color\n",
+    "    curr_marker = ??? # write code to extract marker\n",
    "    \n",
    "    # make a df just of just the data for this variety\n",
    "    variety_df = iris_df[iris_df[\"class\"] == variety] \n",
+    "    # print each subset DataFrame and verify that the output is correct\n",
    "    \n",
    "    #make a scatter plot for this variety\n",
-    "    plot_area = variety_df.plot.scatter(x = \"pet-width\", y = \"pet-length\", \\\n",
+    "    #variety_df.plot.scatter(x = \"pet-width\", y = \"pet-length\")"
-    "                                        label = variety, color = colors[i],\n",
-    "                                        marker = markers[i], \\\n",
-    "                                        ax = plot_area)"
   ]
  },
  {
@@ -159,7 +174,9 @@
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": []
+   "source": [
+    "iris_virginica.plot.scatter(x = \"pet-width\", y = \"pet-length\")"
+   ]
  },
  {
   "cell_type": "markdown",
@@ -176,7 +193,9 @@
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": []
+   "source": [
+    "iris_virginica.plot.scatter(x = \"pet-width\", y = \"pet-length\", xlim = ???, ylim = ???)"
+   ]
  },
  {
   "cell_type": "code",
@@ -204,7 +223,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# How do we extract `pet-length` column Series?\n"
+    "iris_virginica[\"pet-length\"]"
   ]
  },
  {
@@ -272,7 +291,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Close the database connection.\n"
+    "# Close the database connection.\n",
+    "iris_conn.close()"
   ]
  },
  {
@@ -886,7 +906,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.7"
+   "version": "3.9.12"
  }
 },
 "nbformat": 4,

 %% Cell type:code id: tags:
 ``` python
-# ignore this cell (it's just to make certain text red later, but you don't need to understand it).
-from IPython.core.display import display, HTML
-display(HTML('<style>em { color: red; }</style> <style>.container { width:100% !important; }</style>'))
-```
-%% Cell type:code id: tags:
-``` python
-%matplotlib inline
-```
-%% Cell type:code id: tags:
-``` python
 # import statements
 import sqlite3
 import pandas as pd
 from pandas import DataFrame, Series
 import matplotlib
 from matplotlib import pyplot as plt
 matplotlib.rcParams["font.size"] = 16
 ```
 %% Cell type:markdown id: tags:
 #### Warmup 1: Write a function that converts any Fahrenheit temp to Celsius
 C = (5/9) * (f-32)
 %% Cell type:code id: tags:
 ``` python
 def f_to_c(f):
+    return (5/9) * (f-32)
 # test it by making several calls
-print(f_to_c())
+print(f_to_c(212))
-print(f_to_c())
+print(f_to_c(32))
-print(f_to_c())
+print(f_to_c(67))
 ```
 %% Cell type:markdown id: tags:
 #### Warmup 2a: What is the name of the only table inside of iris-flowers.db?
 %% Cell type:code id: tags:
 ``` python
 # Establish a connection to "iris-flowers.db" database
-iris_conn = ???
+iris_conn = sqlite3.connect("iris-flowers.db")
-???
+pd.read_sql("SELECT * FROM sqlite_master WHERE type='table'", iris_conn)
 ```
 %% Cell type:markdown id: tags:
 #### Warmup 2b: Save & display all the data from this table to a variable called "iris_df"
 %% Cell type:code id: tags:
 ``` python
-iris_df = ???
+iris_df = pd.read_sql("SELECT * FROM iris", iris_conn)
 iris_df
 ```
 %% Cell type:markdown id: tags:
-#### Warmup 3: Scatter plot to visualize relationship between `pet-width` and `pet-length`
+#### Warmup 3a: What are all the class types?
+%% Cell type:code id: tags:
+``` python
+# v1: pandas
+varietes = iris_df["class"]
+varietes
+```
+%% Cell type:code id: tags:
+``` python
+# v2: SQL
+varietes = list(pd.read_sql("""
+    SELECT DISTINCT class
+    FROM iris
+""", iris_conn)["class"])
+varietes
+```
+%% Cell type:markdown id: tags:
+#### Warmup 3b: Scatter plot to visualize relationship between `pet-width` and `pet-length`
 %% Cell type:code id: tags:
 ``` python
 # complete this code to make 3 plots in one
 colors = ["blue", "green", "red"]
 markers = ["o", "^", "v"]
 # getting unique class column values
 varietes = list(set(iris_df["class"]))
-plot_area = None
+# Iterate over indices of varieties list
+# Q: Why are we iterating over indices instead of values here?
+#    Discuss how it will be useful to extract information from other lists
+#    like colors and markers
 for i in range(len(varietes)):
    variety = varietes[i]
+    curr_color = ??? # write code to extract color
+    curr_marker = ??? # write code to extract marker
    # make a df of just the data for this variety
    variety_df = iris_df[iris_df["class"] == variety]
+    # print each subset DataFrame and verify that the output is correct
    #make a scatter plot for this variety
-    plot_area = variety_df.plot.scatter(x = "pet-width", y = "pet-length", \
+    #variety_df.plot.scatter(x = "pet-width", y = "pet-length")
-                                        label = variety, color = colors[i],
-                                        marker = markers[i], \
-                                        ax = plot_area)
 ```
 %% Cell type:markdown id: tags:
 #### Let's focus on "Iris-virginica" data
 %% Cell type:code id: tags:
 ``` python
 iris_virginica = ???
 # assert that length of iris_virginica is exactly 50
 ???
 iris_virginica.head()
 ```
 %% Cell type:markdown id: tags:
 #### Create scatter plot to visualize relationship between `pet-width` and `pet-length`
 %% Cell type:code id: tags:
 ``` python
+iris_virginica.plot.scatter(x = "pet-width", y = "pet-length")
 ```
 %% Cell type:markdown id: tags:
 ### Let's learn about *xlim* and *ylim*
 - Allows us to set x-axis and y-axis limits
 - Takes either a single value (LOWER-BOUND) or a tuple containing two values (LOWER-BOUND, UPPER-BOUND)
 - You need to be careful about setting the UPPER-BOUND
 %% Cell type:code id: tags:
 ``` python
+iris_virginica.plot.scatter(x = "pet-width", y = "pet-length", xlim = ???, ylim = ???)
 ```
 %% Cell type:code id: tags:
 ``` python
 ax = iris_virginica.plot.scatter(x = "pet-width", y = "pet-length",
                    xlim = ???, ylim = ???,
                    figsize = (3, 3))
 # What is wrong with this plot?
 ```
 %% Cell type:markdown id: tags:
 What is the maximum `pet-length`?
 %% Cell type:code id: tags:
 ``` python
-# How do we extract `pet-length` column Series?
+iris_virginica["pet-length"]
 ```
 %% Cell type:markdown id: tags:
 For every set method, there is a corresponding get method. Try `ax.get_ylim()`.
 %% Cell type:code id: tags:
 ``` python
 ```
 %% Cell type:markdown id: tags:
 Let's include assert statements to make sure we don't crop the plot!
 %% Cell type:code id: tags:
 ``` python
 ax = iris_virginica.plot.scatter(x = "pet-width", y = "pet-length",
                     xlim = (0, 6), ylim = (0, 6),
                     figsize = (3, 3))
 #print("Ran into AssertionError while checking axes limits")
 ```
 %% Cell type:markdown id: tags:
 ### Now let's try all 4 assert statements
 ```
 assert iris_virginica[ax.get_xlabel()].min() >= ax.get_xlim()[0]
 assert iris_virginica[ax.get_xlabel()].max() <= ax.get_xlim()[1]
 assert iris_virginica[ax.get_ylabel()].min() >= ax.get_ylim()[0]
 assert iris_virginica[ax.get_ylabel()].max() <= ax.get_ylim()[1]
 ```
 %% Cell type:code id: tags:
 ``` python
 ax = iris_virginica.plot.scatter(x = "pet-width", y = "pet-length",
                     xlim = (0, 7), ylim = (0, 7),
                     figsize = (3, 3))
 ```
 %% Cell type:code id: tags:
 ``` python
 # Close the database connection.
+iris_conn.close()
 ```
 %% Cell type:markdown id: tags:
 # Plotting Applications
 **Learning Objectives**
 - Make a line plot on a series or on a DataFrame
 - Apply features of line plots and bar plots to visualize results of data investigations
 - Clean Series data by dropping NaN values and by converting to int
 - Make a stacked bar plot
 %% Cell type:markdown id: tags:
 ## Line plots
 - `SERIES.plot.line()`       each value in the Series becomes y-value and each index becomes x-value
 - `DATAFRAME.plot.line()`    each column in the data frame becomes a line in the plot
 - ***IMPORTANT***: lines in line plots shouldn't be crooked, you need to sort the values based on increasing order of indices!
 https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.line.html
 %% Cell type:markdown id: tags:
 ### Plotting line from a Series
 %% Cell type:code id: tags:
 ``` python
 # when you make a series from a list, the default indices are 0, 1, 2, ...
 s = Series([0, 100, 300, 200, 400])
 s
 ```
 %% Cell type:code id: tags:
 ``` python
 s = Series([0, 100, 300, 200, 400], index = [0, 20, 21, 22, 1])
 s # oops this produces a crooked line plot!
 ```
 %% Cell type:code id: tags:
 ``` python
 # Let's fix it by sorting the Series values based on the indices
 ```
 %% Cell type:markdown id: tags:
 ### Craft breweries example
 %% Cell type:code id: tags:
 ``` python
 # You can make a series from a list and add indices
 s = Series([1758, 2002, 2408, 2898, 3814, 4803, 5713, 6661, 7618, 8391, 8764], \
           index=[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])
 # We can save the AxesSubplot and "beautify" it like the other plots...
 # Set title to "Craft Breweries in the USA"
 # Set x-axis label to "Year"
 # Set y-axis label to "# Craft Breweries"
 ```
 %% Cell type:code id: tags:
 ``` python
 # Be careful! If the indices are out of order you get a mess
 # pandas plots each (index, value) in the order given
 s = Series([1758, 2408, 2898, 3814, 4803, 5713, 6661, 7618, 8391, 8764, 2002], \
           index=[2010, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2011])
 # TODO: fix this crooked line plot
 s.plot.line()
 ```
 %% Cell type:code id: tags:
 ``` python
 # Fix it here
 ```
 %% Cell type:markdown id: tags:
 ### Temperature example
 Plotting lines from a DataFrame
 - `DATAFRAME.plot.line()`    each column in the data frame becomes a line in the plot
 - ***IMPORTANT***: lines in line plots shouldn't be crooked, you need to sort the values based on increasing order of indices!
 %% Cell type:code id: tags:
 ``` python
 # Build temp_df from a dict of lists: each key names one list of values.
 # City of Madison normal high and low (degrees F) by month.
 # Each list holds 12 values, one per month; no index is passed to DataFrame.
 temp_df = DataFrame( {
    "high": [26, 31, 43, 57, 68, 78, 82, 79, 72, 59, 44, 30],
    "low": [11, 15, 25, 36, 46, 56, 61, 59, 50, 39, 28, 16]}
 )
 # Q: do "high" and "low" become rows or columns within the DataFrame?
 # A:
 # Leave the bare name as the last line so the notebook displays the DataFrame.
 temp_df
 ```
 %% Cell type:code id: tags:
 ``` python
 # Let's create line plots
 # not a nice plot
 # Let's fix the aesthetics
 ```
 %% Cell type:markdown id: tags:
 ### A Line Plot made from a DataFrame automatically plots all columns
 The same is true for bar plots; we'll see this later.
 `ax.set_xticks(...)`: takes a sequence of numbers as its argument and adds ticks at those locations.
 %% Cell type:code id: tags:
 ``` python
 # You can also add ticks and ticklabels to a line plot
 # TODOs:
 # 1. Also add figure size as (8, 4)
 # 2. Add xticks - how many do we need?
 # 3. Add xticklabels and rotate them by 45 degrees
 #["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
 ax = temp_df.plot.line(???)
 ax.set_title("Average Temperatures in Madison, WI")
 ax.set_xlabel("Month")
 ax.set_ylabel("Temp (Fahrenheit)")
 ax.set_xticks(???)   # makes a sequence of integers from 0 to 11
 ax.set_xticklabels(???, ???)
 # This gets rid of the weird output
 None
 ```
 %% Cell type:code id: tags:
 ``` python
 # We could explicitly pass arguments to the "x" and "y" parameters
 temp_df_with_month = DataFrame(
    {
    "month": ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
    "high": [26, 31, 43, 57, 68, 78, 82, 79, 72, 59, 44, 30],
    "low": [11, 15, 25, 36, 46, 56, 61, 59, 50, 39, 28, 16]}
 )
 ax = temp_df_with_month.plot.line(x = ???, y = ???, figsize = (8, 4))
 ax.set_title("Average Temperatures in Madison, WI")
 ax.set_xlabel("Month")
 ax.set_ylabel("Temp (Fahrenheit)")
 ```
 %% Cell type:markdown id: tags:
 ### We can perform a calculation on an entire DataFrame
 Let's change the entire DataFrame to Celsius
 %% Cell type:code id: tags:
 ``` python
 # call the function on the dataframe
 celcius_df = ???
 celcius_df
 ```
 %% Cell type:code id: tags:
 ``` python
 # here is one way to add a horizontal line to our line plots
 celcius_df[???] = ???
 celcius_df
 ```
 %% Cell type:code id: tags:
 ``` python
 # Plot every column of celcius_df as its own line, with figsize (8, 4),
 # and keep the returned axes object so it can be customized below.
 ax = celcius_df.plot.line(figsize = (8, 4))
 ax.set_xlabel("Month")
 ax.set_ylabel("Temp (Celcius)")
 # Place one tick at each position 0-11, then label those ticks with month
 # abbreviations rotated by 45 degrees.
 # NOTE(review): assumes celcius_df has 12 rows in Jan-Dec order - confirm
 # against the cell that builds celcius_df.
 ax.set_xticks(range(12))
 ax.set_xticklabels(["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], rotation = 45)
 # Turn on grid lines for the axes.
 ax.grid()
 # End the cell with None so the notebook does not echo the value of the
 # last call as cell output.
 None
 ```
 %% Cell type:markdown id: tags:
 ## Bar plots using DataFrames
 %% Cell type:markdown id: tags:
 Bar Plot Example w/ Fire Hydrants
 - General review of pandas
 - Some new bar plot options
 %% Cell type:code id: tags:
 ``` python
 # TODO: read "Fire_Hydrants.csv" into a DataFrame
 hdf = ???
 hdf.tail()
 ```
 %% Cell type:code id: tags:
 ``` python
 # Extract just the column names
 ```
 %% Cell type:markdown id: tags:
 ### Let's create a *bar plot* to visualize *colors* of fire hydrants.
 %% Cell type:code id: tags:
 ``` python
 # Make a Series called color_counts which stores the value counts of "nozzle_color"
 color_counts = ???
 color_counts # what is wrong with this data?
 ```
 %% Cell type:code id: tags:
 ``` python
 # TODO: Clean the data ......use str.upper()
 color_counts = ???
 color_counts
 ```
 %% Cell type:code id: tags:
 ``` python
 # Make a horizontal bar plot of counts of colors and have the colors match
 # use color list: ["b", "g", "darkorange", "r", "c", "0.5"]
 ax = ???
 ax.set_xlabel("Fire hydrant count")
 ```
 %% Cell type:markdown id: tags:
 ### Let's create a *bar plot* to visualize *style* of fire hydrants.
 %% Cell type:code id: tags:
 ``` python
 # Do the same thing as we did for the colors but this time for the "Style"
 style_counts = ???
 style_counts
 ```
 %% Cell type:code id: tags:
 ``` python
 ```
 %% Cell type:code id: tags:
 ``` python
 # Grab the top 12
 top12 = ???
 # and then add an index to our Series for the sum of all the "other" styles
 top12[???] = ???
 ```
 %% Cell type:code id: tags:
 ``` python
 # Plot the results
 ax = ???(color = "firebrick")
 ax.set_ylabel("Hydrant Count")
 ax.set_xlabel("Hydrant Type")
 ```
 %% Cell type:markdown id: tags:
 ### In what *decade* were *pacers manufactured*?
 ### Take a peek at the *Style* column data
 %% Cell type:code id: tags:
 ``` python
 hdf["Style"]
 ```
 %% Cell type:markdown id: tags:
 ### Which *column* gives *year* information?
 %% Cell type:code id: tags:
 ``` python
 hdf.columns
 ```
 %% Cell type:markdown id: tags:
 ### How to get the *year_manufactured* for *pacers* and *others*?
 %% Cell type:code id: tags:
 ``` python
 # Let's get the year manufactured for all of the "Pacer" hydrants.
 pacer_years = ???
 # Note: We can do this either way
 # pacer_years = hdf["year_manufactured"][hdf["Style"] == "Pacer"]
 pacer_years
 ```
 %% Cell type:code id: tags:
 ``` python
 # then do the same for all the other data
 other_years = ???
 other_years
 ```
 %% Cell type:markdown id: tags:
 ### How to get the *decade* for *pacers*?
 %% Cell type:code id: tags:
 ``` python
 # Round each year down to the start of the decade.
 # e.g. 1987 --> 1980, 2003 --> 2000
 pacer_decades = ???
 pacer_decades
 ```
 %% Cell type:markdown id: tags:
 ### How to convert the *decades* back to *int*?
 - `astype(...)` method
 - `dropna(...)` method
 %% Cell type:code id: tags:
 ``` python
 # Drop the NaN values and convert to int (value counts are done in the next cell)
 pacer_decades = ???
 ```
 %% Cell type:markdown id: tags:
 ### How to *count the decades* for pacers?
 %% Cell type:code id: tags:
 ``` python
 pacer_decades_count = ???
 pacer_decades_count
 ```
 %% Cell type:markdown id: tags:
 ### Count the *decades* for others.
 %% Cell type:code id: tags:
 ``` python
 # Do the same thing for other_years. Save to a variable called "other_decades"
 other_decades = ???
 other_decades_count = ???
 other_decades_count
 ```
 %% Cell type:markdown id: tags:
 ### Build a DataFrame from a dictionary of key, Series
 %% Cell type:code id: tags:
 ``` python
 plot_df = DataFrame(???)
 plot_df # observe the NaN values
 ```
 %% Cell type:code id: tags:
 ``` python
 # make a bar plot
 ax = ???
 ax.set_xlabel("Decade")
 ax.set_ylabel("Hydrant Count")
 ```
 %% Cell type:markdown id: tags:
 #### Ignore data from before 1950 using boolean indexing.
 %% Cell type:code id: tags:
 ``` python
 ax = ???
 ax.set_xlabel("Decade")
 ax.set_ylabel("Hydrant Count")
 ```
 %% Cell type:markdown id: tags:
 ### Stacked Bar Chart
 `stacked` parameter accepts boolean value as argument
 %% Cell type:code id: tags:
 ``` python
 ax = ???
 ax.set_xlabel("Decade")
 ax.set_ylabel("Hydrant Count")
 None
 ```