updt

a8167294 · Cole Nelson · 9f762d7c · a8167294
Commit a8167294 authored 1 year ago by Cole Nelson
--- a/f23/Cole_Lecture_Notes/37_AdvPandas/Lec37_AdvPandas_Solution_Nelson.ipynb
+++ b/f23/Cole_Lecture_Notes/37_AdvPandas/Lec37_AdvPandas_Solution_Nelson.ipynb
@@ -149,7 +149,7 @@
   "source": [
    "# Warmup 3d: Can you get this same data using SQL?\n",
    "qry(\"\"\"\n",
-    "SELECT role, COUNT(role) as NumAnswers\n",
+    "SELECT role, COUNT(*) as NumAnswers\n",
    "FROM piazza\n",
    "WHERE answers > 10\n",
    "GROUP BY role\n",
@@ -165,7 +165,7 @@
   "source": [
    "# Warmup 3e: What about their average # of days online as well?\n",
    "qry(\"\"\"\n",
-    "SELECT role, COUNT(role) as NumAnswers, AVG(days_online) as AvgDaysOnline\n",
+    "SELECT role, COUNT(*) as NumAnswers, AVG(days_online) as AvgDaysOnline\n",
    "FROM piazza\n",
    "WHERE answers > 10\n",
    "GROUP BY role\n",

 %% Cell type:markdown id: tags:

 # Advanced Pandas

 %% Cell type:code id: tags:

 ``` python
 # known import statements
 import pandas as pd
 import sqlite3
 import os

 # new import statement
 import numpy as np
 ```

 %% Cell type:code id: tags:

 ``` python
 # Get the Piazza data from 'piazza.db'

 db_name = "piazza.db"
 assert os.path.exists(db_name)
 conn = sqlite3.connect(db_name)

 def qry(sql):
    """Execute the SQL text `sql` on the open `conn` connection.

    Returns whatever `pd.read_sql` produces for that query.
    """
    frame = pd.read_sql(sql, conn)
    return frame

 df = qry("""
    SELECT *
    FROM sqlite_master
    WHERE type='table'
 """)
 print(df.iloc[0]['sql'])
 ```

 %% Cell type:code id: tags:

 ``` python
 piazza_df = pd.read_sql("""
    SELECT *
    FROM piazza
 """, conn)
 piazza_df.head(5)
 ```

 %% Cell type:code id: tags:

 ``` python
 # Warmup 1: Set the student id column as the index
 piazza_df = piazza_df.set_index("student_id")
 piazza_df
 ```

 %% Cell type:code id: tags:

 ``` python
 # Warmup 2a: Which 10 students post the most?
 top_students = piazza_df[piazza_df["role"] == "student"].sort_values("posts", ascending=False).head(10)
 top_students
 ```

 %% Cell type:code id: tags:

 ``` python
 # Warmup 2b: Can you plot their number of posts as a bar graph? Be sure to label your axes!
 ax = top_students["posts"].plot.bar()
 ax.set_xlabel("Student ID")
 ax.set_ylabel("# of Posts")
 ax.set_title("Top Posting Students")
 ```

 %% Cell type:code id: tags:

 ``` python
 # Warmup 2c: How about with their name rather than their student id?
 ax = top_students.plot.bar(x="name", y="posts")
 ax.set_xlabel("Student")
 ax.set_ylabel("# of Posts")
 ax.set_title("Top Posting Students")
 ```

 %% Cell type:code id: tags:

 ``` python
 # Warmup 3a: Which people had more than 10 answers? Include all roles.
 top_answers = piazza_df[piazza_df["answers"] > 10].sort_values("answers", ascending=False)
 top_answers
 ```

 %% Cell type:code id: tags:

 ``` python
 # Warmup 3b: Plot this as a bar graph.
 top_answers["answers"].plot.bar()
 ```

 %% Cell type:code id: tags:

 ``` python
 # Warmup 3c: Plot the contributions as a bar graph.
 top_answers["role"].value_counts().plot.bar()
 ```

 %% Cell type:code id: tags:

 ``` python
 # Warmup 3d: Can you get this same data using SQL?
 qry("""
-SELECT role, COUNT(role) as NumAnswers
+SELECT role, COUNT(*) as NumAnswers
 FROM piazza
 WHERE answers > 10
 GROUP BY role
 ORDER BY NumAnswers DESC
 """)
 ```

 %% Cell type:code id: tags:

 ``` python
 # Warmup 3e: What about their average # of days online as well?
 qry("""
-SELECT role, COUNT(role) as NumAnswers, AVG(days_online) as AvgDaysOnline
+SELECT role, COUNT(*) as NumAnswers, AVG(days_online) as AvgDaysOnline
 FROM piazza
 WHERE answers > 10
 GROUP BY role
 ORDER BY NumAnswers DESC
 """)
 ```

 %% Cell type:code id: tags:

 ``` python
 # Warmup 3f: Can we do that in Pandas as well?
 # Today's topic!
 ```

 %% Cell type:markdown id: tags:

 # Today's Learning Objectives:

 * Setting column as index for pandas `DataFrame`
 * Identify, drop, or fill missing values (`np.nan`) using Pandas `isna`, `dropna`, and `fillna`
 * Applying transformations to `DataFrame`:
  * Use `apply` on pandas `Series` to apply a transformation function
  * Use `replace` to replace all target values in Pandas `Series` and `DataFrame` rows / columns
 * Filter, aggregate, group, and summarize information in a `DataFrame` with `groupby`
 * Convert .groupby examples to SQL
 * Solving the same question using SQL and pandas `DataFrame` manipulations:
  * filtering, grouping, and aggregation / summarization

 %% Cell type:code id: tags:

 ``` python
 # Sort by name... What do we notice?
 piazza_df.sort_values("name") # Some names are missing!
 ```

 %% Cell type:markdown id: tags:

 ### Not a Number

 - `np.nan` is the floating point representation of Not a Number
 - You do not need to know / learn the details about the `numpy` package

 ### Replacing / modifying values within the `DataFrame`

 Syntax: `df.replace(<TARGET>, <REPLACE>)`

 Let's now replace the missing values (empty strings) with `np.nan`

 %% Cell type:code id: tags:

 ``` python
 # Let's replace these empty strings with a special value.
 # Use the lowercase `np.nan` spelling: the `np.NaN` alias was removed in
 # NumPy 2.0, while `np.nan` works on every NumPy version.
 piazza_df = piazza_df.replace("", np.nan)
 piazza_df
 ```

 %% Cell type:code id: tags:

 ``` python
 # Sort by name again... What do we notice?
 piazza_df.sort_values("name") # NaN's are at the end!
 ```

 %% Cell type:markdown id: tags:

 ### Checking for missing values

 Syntax: `Series.isna()`
 - Returns a boolean Series

 %% Cell type:code id: tags:

 ``` python
 # Run isna() on the name column
 piazza_df["name"].isna()
 ```

 %% Cell type:code id: tags:

 ``` python
 # How many people are missing a name?
 piazza_df["name"].isna().value_counts()
 ```

 %% Cell type:code id: tags:

 ``` python
 # How many people are missing an email?
 piazza_df["email"].isna().value_counts()
 ```

 %% Cell type:code id: tags:

 ``` python
 # How many people are missing both a name and email?
 ((piazza_df["name"].isna()) & (piazza_df["email"].isna())).value_counts()
 ```

 %% Cell type:code id: tags:

 ``` python
 # How many people are missing either a name or email?
 ((piazza_df["name"].isna()) | (piazza_df["email"].isna())).value_counts()
 ```

 %% Cell type:code id: tags:

 ``` python
 # So... What do we do?
 #  1. Drop those rows
 #  2. Interpolate / Best Guess
 ```

 %% Cell type:code id: tags:

 ``` python
 # Option 1: Drop those rows.
 pure_piazza_df = piazza_df.dropna()
 pure_piazza_df
 ```

 %% Cell type:code id: tags:

 ``` python
 # Option 2a: Interpolate / Best Guess
 anon_piazza_df = piazza_df.fillna("Anonymous")
 anon_piazza_df
 ```

 %% Cell type:code id: tags:

 ``` python
 # Create a function to take an email (e.g. "calm_star@wisc.edu")
 # and return the name (e.g. "calm star")
 def parse_name_from_email(email):
    """Guess a display name from an email address.

    The text before the first "@" is taken and its underscores become
    spaces (e.g. "calm_star@wisc.edu" -> "calm star").
    A missing email (anything `pd.isna` reports as missing) yields `np.nan`.
    """
    if not pd.isna(email):
        # partition keeps everything before the first "@" (or the whole
        # string when there is no "@" at all).
        local_part = email.partition("@")[0]
        return local_part.replace("_", " ")
    return np.nan

 # Test your function!
 parse_name_from_email("calm_star@wisc.edu")
 ```

 %% Cell type:markdown id: tags:

 ### Review: `Pandas.Series.apply(...)`
 Syntax: `Series.apply(<FUNCTION OBJECT REFERENCE>)`
 - applies input function to every element of the Series.
 - Returns a new `Series`

 %% Cell type:code id: tags:

 ``` python
 # Now, apply that function to each value in email!
 piazza_df["guessed_name"] = piazza_df["email"].apply(parse_name_from_email)
 piazza_df
 ```

 %% Cell type:code id: tags:

 ``` python
 # Create a function to take a name (e.g. "calm star")
 # and return the email (e.g. "calm_star@wisc.edu")
 def parse_email_from_name(name):
    """Guess an email address from a display name.

    Spaces in the name become underscores and "@wisc.edu" is appended
    (e.g. "calm star" -> "calm_star@wisc.edu").
    A missing name (anything `pd.isna` reports as missing) yields `np.nan`.
    """
    if not pd.isna(name):
        username = name.replace(" ", "_")
        return username + "@wisc.edu"
    return np.nan

 # Test your function!
 parse_email_from_name("calm star")
 ```

 %% Cell type:code id: tags:

 ``` python
 # Now, apply that function to each value in name!
 piazza_df["guessed_email"] = piazza_df["name"].apply(parse_email_from_name)
 piazza_df
 ```

 %% Cell type:markdown id: tags:

 ### `Pandas.DataFrame.apply(...)`
 Syntax: `DataFrame.apply(<FUNCTION OBJECT REFERENCE>, axis=1)`
 - `axis=1` means apply to each row.
 - returns a new `Series`

 %% Cell type:code id: tags:

 ``` python
 # If the name has a value, use it, otherwise use our best guess!
 piazza_df["name"] = piazza_df.apply(lambda r : r["guessed_name"] if pd.isna(r["name"]) else r["name"], axis=1)
 ```

 %% Cell type:code id: tags:

 ``` python
 # Same thing for email!
 piazza_df["email"] = piazza_df.apply(lambda r : r["guessed_email"] if pd.isna(r["email"]) else r["email"], axis=1)
 ```

 %% Cell type:code id: tags:

 ``` python
 # Drop the guessing columns
 piazza_df = piazza_df.drop("guessed_name", axis=1)
 piazza_df = piazza_df.drop("guessed_email", axis=1)
 ```

 %% Cell type:code id: tags:

 ``` python
 # How many rows are missing data now?
 # dropna() keeps only the complete rows, so subtract that count from the
 # total to get the number of rows that still contain a missing value.
 len(piazza_df) - len(piazza_df.dropna()) # lecture notes expected "only 12!" -- TODO confirm
 ```

 %% Cell type:code id: tags:

 ``` python
 # Give a name of "anonymous" and email of "anonymous@wisc.edu"
 # to anyone left with missing data.
 piazza_df["name"] = piazza_df["name"].fillna("anonymous")
 piazza_df["email"] = piazza_df["email"].fillna("anonymous@wisc.edu")
 len(piazza_df)
 ```

 %% Cell type:markdown id: tags:

 ### `Pandas.DataFrame.groupby(...)`

 Syntax: `DataFrame.groupby(<COLUMN>)`
 - Returns a `groupby` object
 - Need to apply aggregation functions to use the return value of `groupby`

 %% Cell type:code id: tags:

 ``` python
 # What does this return?
 piazza_df.groupby("role") # a groupby object!
 ```

 %% Cell type:code id: tags:

 ``` python
 # Try getting the "mean" of this groupby object.
 # numeric_only=True averages just the numeric columns. Since pandas 2.0 the
 # default (numeric_only=False) raises a TypeError on text columns such as
 # name and email instead of silently skipping them.
 piazza_df.groupby("role").mean(numeric_only=True)
 ```

 %% Cell type:code id: tags:

 ``` python
 # How many answers does the average instructor, student, and TA give?
 piazza_df[["role", "answers"]].groupby("role").mean()
 ```

 %% Cell type:code id: tags:

 ``` python
 # How would we write this in SQL?
 qry("""
 SELECT role, AVG(answers)
 FROM piazza
 GROUP BY role
 """)
 ```

 %% Cell type:code id: tags:

 ``` python
 # What is the total number of days spent online for instructors, students, and TAs?
 # Order your answer from lowest to highest
 piazza_df[["role", "days_online"]].groupby("role").sum().sort_values("days_online")
 ```

 %% Cell type:code id: tags:

 ``` python
 # How would we write this in SQL?
 # The column is aliased "Total..." because SUM(...) adds the days up;
 # it is a total per role, not an average.
 qry("""
 SELECT role, SUM(days_online) as TotalDaysOnline
 FROM piazza
 GROUP BY role
 ORDER BY TotalDaysOnline
 """)
 ```

 %% Cell type:code id: tags:

 ``` python
 # Of those individuals who spend less than 100 days online,
 # how does their average number of posts compare to those that
 # spend 100 days or more online? Do your analysis by role as well.

 less_than_100 = piazza_df[piazza_df["days_online"] < 100]
 more_than_100 = piazza_df[piazza_df["days_online"] >= 100]

 # In general, they post less...
 print(more_than_100["posts"].mean(), less_than_100["posts"].mean())
 print()

 # ... and this is also generally true.
 print(more_than_100[["role", "posts"]].groupby("role").mean())
 print(less_than_100[["role", "posts"]].groupby("role").mean())
 ```

 %% Cell type:code id: tags:

 ``` python
 # How would we write this in SQL?
 qry("""
 SELECT role, AVG(posts) as AvgPosts
 FROM piazza
 WHERE days_online < 100
 GROUP BY role
 """)
 ```

 %% Cell type:code id: tags:

 ``` python
 qry("""
 SELECT role, AVG(posts) as AvgPosts
 FROM piazza
 WHERE days_online >= 100
 GROUP BY role
 """)
 ```

 %% Cell type:code id: tags:

 ``` python
 # What percentage of instructors, students, and TAs did not write a single answer,
 # followup, or reply to a followup?
 no_answers = piazza_df[(piazza_df["answers"] == 0) & (piazza_df["followups"] == 0) & (piazza_df["replies_to_followups"] == 0)]
 role_totals = piazza_df["role"].value_counts()
 # Reindex onto every role so that a role with no such members shows up as
 # 0% rather than NaN (a role absent from the numerator would otherwise
 # produce a missing value in the division).
 no_answers["role"].value_counts().reindex(role_totals.index, fill_value=0) / role_totals * 100
 ```

 %% Cell type:code id: tags:

 ``` python
 # How would we write this in SQL?
 # The best we can write (without knowing subqueries) is how many!
 qry("""
 SELECT role, COUNT(*)
 FROM piazza
 WHERE answers = 0 AND followups = 0 AND replies_to_followups = 0
 GROUP BY role
 """)
 ```

 %% Cell type:code id: tags:

 ``` python
 # ... and then compare this with the total #!
 qry("""
 SELECT role, COUNT(*)
 FROM piazza
 GROUP BY role
 """)
 ```

 %% Cell type:code id: tags:

 ``` python
 conn.close()
 ```