diff --git a/f23/Cole_Lecture_Notes/37_AdvPandas/Lec37_AdvPandas_Solution_Nelson.ipynb b/f23/Cole_Lecture_Notes/37_AdvPandas/Lec37_AdvPandas_Solution_Nelson.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..23f5dc7fbac84ee5935b443ce2259ee78cfbec8b --- /dev/null +++ b/f23/Cole_Lecture_Notes/37_AdvPandas/Lec37_AdvPandas_Solution_Nelson.ipynb @@ -0,0 +1,685 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Advanced Pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CeWtFirwteFY" + }, + "outputs": [], + "source": [ + "# known import statements\n", + "import pandas as pd\n", + "import sqlite3\n", + "import os\n", + "\n", + "# new import statement\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the Piazza data from 'piazza.db'\n", + "\n", + "db_name = \"piazza.db\"\n", + "assert os.path.exists(db_name)\n", + "conn = sqlite3.connect(db_name)\n", + "\n", + "def qry(sql):\n", + " return pd.read_sql(sql, conn)\n", + "\n", + "df = qry(\"\"\"\n", + " SELECT *\n", + " FROM sqlite_master\n", + " WHERE type='table'\n", + "\"\"\")\n", + "print(df.iloc[0]['sql'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "piazza_df = pd.read_sql(\"\"\"\n", + " SELECT *\n", + " FROM piazza\n", + "\"\"\", conn)\n", + "piazza_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 1: Set the student id column as the index\n", + "piazza_df = piazza_df.set_index(\"student_id\")\n", + "piazza_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 2a: Which 10 students post the most?\n", + "top_students = piazza_df[piazza_df[\"role\"] == \"student\"].sort_values(\"posts\", 
ascending=False).head(10)\n", + "top_students" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 2b: Can you plot their number of posts as a bar graph? Be sure to label your axes!\n", + "ax = top_students[\"posts\"].plot.bar()\n", + "ax.set_xlabel(\"Student ID\")\n", + "ax.set_ylabel(\"# of Posts\")\n", + "ax.set_title(\"Top Posting Students\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 2c: How about with their name rather than their student id?\n", + "ax = top_students.plot.bar(x=\"name\", y=\"posts\")\n", + "ax.set_xlabel(\"Student\")\n", + "ax.set_ylabel(\"# of Posts\")\n", + "ax.set_title(\"Top Posting Students\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# Warmup 3a: Which people had more than 10 answers? Include all roles.\n", + "top_answers = piazza_df[piazza_df[\"answers\"] > 10].sort_values(\"answers\", ascending=False)\n", + "top_answers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 3b: Plot this as a bar graph.\n", + "top_answers[\"answers\"].plot.bar()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 3c: Plot the contributions as a bar graph.\n", + "top_answers[\"role\"].value_counts().plot.bar()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 3d: Can you get this same data using SQL?\n", + "qry(\"\"\"\n", + "SELECT role, COUNT(role) as NumAnswers\n", + "FROM piazza\n", + "WHERE answers > 10\n", + "GROUP BY role\n", + "ORDER BY NumAnswers DESC\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 
Warmup 3e: What about their average # of days online as well?\n", + "qry(\"\"\"\n", + "SELECT role, COUNT(role) as NumAnswers, AVG(days_online) as AvgDaysOnline\n", + "FROM piazza\n", + "WHERE answers > 10\n", + "GROUP BY role\n", + "ORDER BY NumAnswers DESC\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 3f: Can we do that in Pandas as well?\n", + "# Today's topic!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yoLGptrqhbBo" + }, + "source": [ + "# Today's Learning Objectives: \n", + "\n", + "* Setting column as index for pandas `DataFrame`\n", + "* Identify, drop, or fill missing values (`np.NaN`) using Pandas `isna`, `dropna`, and `fillna`\n", + "* Applying transformations to `DataFrame`:\n", + " * Use `apply` on pandas `Series` to apply a transformation function\n", + " * Use `replace` to replace all target values in Pandas `Series` and `DataFrame` rows / columns\n", + "* Filter, aggregate, group, and summarize information in a `DataFrame` with `groupby`\n", + "* Convert .groupby examples to SQL\n", + "* Solving the same question using SQL and pandas `DataFrame` manipulations:\n", + " * filtering, grouping, and aggregation / summarization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by name... What do we notice?\n", + "piazza_df.sort_values(\"name\") # Some names are missing!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Not a Number\n", + "\n", + "- `np.nan` is the floating point representation of Not a Number (the older `np.NaN` spelling was removed in NumPy 2.0)\n", + "- You do not need to know / learn the details about the `numpy` package \n", + "\n", + "### Replacing / modifying values within the `DataFrame`\n", + "\n", + "Syntax: `df.replace(<TARGET>, <REPLACE>)`\n", + "\n", + "Let's now replace the missing values (empty strings) with `np.nan`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's replace these empty strings with a special value.\n", + "piazza_df = piazza_df.replace(\"\", np.nan)\n", + "piazza_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by name again... What do we notice?\n", + "piazza_df.sort_values(\"name\") # NaN's are at the end!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Checking for missing values\n", + "\n", + "Syntax: `Series.isna()`\n", + "- Returns a boolean Series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run isna() on the name column\n", + "piazza_df[\"name\"].isna()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How many people are missing a name?\n", + "piazza_df[\"name\"].isna().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How many people are missing an email?\n", + "piazza_df[\"email\"].isna().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How many people are missing both a name and email?\n", + "((piazza_df[\"name\"].isna()) & (piazza_df[\"email\"].isna())).value_counts()" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How many people are missing either a name or email?\n", + "((piazza_df[\"name\"].isna()) | (piazza_df[\"email\"].isna())).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# So... What do we do?\n", + "# 1. Drop those rows\n", + "# 2. Interpolate / Best Guess" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Option 1: Drop those rows.\n", + "pure_piazza_df = piazza_df.dropna()\n", + "pure_piazza_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Option 2a: Interpolate / Best Guess\n", + "anon_piazza_df = piazza_df.fillna(\"Anonymous\")\n", + "anon_piazza_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a function to take an email (e.g. \"calm_star@wisc.edu\")\n", + "# and return the name (e.g. 
\"calm star\")\n", + "def parse_name_from_email(email):\n", + " if pd.isna(email):\n", + " return np.nan\n", + " else:\n", + " return email.split(\"@\")[0].replace(\"_\", \" \")\n", + "\n", + "# Test your function!\n", + "parse_name_from_email(\"calm_star@wisc.edu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Review: `Pandas.Series.apply(...)`\n", + "Syntax: `Series.apply(<FUNCTION OBJECT REFERENCE>)`\n", + "- applies input function to every element of the Series.\n", + "- Returns a new `Series`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Now, apply that function to each value in email!\n", + "piazza_df[\"guessed_name\"] = piazza_df[\"email\"].apply(parse_name_from_email)\n", + "piazza_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a function to take a name (e.g. \"calm star\")\n", + "# and return the email (e.g. 
\"calm_star@wisc.edu\")\n", + "def parse_email_from_name(name):\n", + " if pd.isna(name):\n", + " return np.nan\n", + " else:\n", + " return name.replace(\" \", \"_\") + \"@wisc.edu\"\n", + "\n", + "# Test your function!\n", + "parse_email_from_name(\"calm star\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Now, apply that function to each value in name!\n", + "piazza_df[\"guessed_email\"] = piazza_df[\"name\"].apply(parse_email_from_name)\n", + "piazza_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `Pandas.DataFrame.apply(...)`\n", + "Syntax: `DataFrame.apply(<FUNCTION OBJECT REFERENCE>, axis=1)`\n", + "- `axis=1` means apply to each row.\n", + "- returns a new `Series`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If the name has a value, use it, otherwise use our best guess!\n", + "piazza_df[\"name\"] = piazza_df.apply(lambda r : r[\"guessed_name\"] if pd.isna(r[\"name\"]) else r[\"name\"], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Same thing for email!\n", + "piazza_df[\"email\"] = piazza_df.apply(lambda r : r[\"guessed_email\"] if pd.isna(r[\"email\"]) else r[\"email\"], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop the guessing columns\n", + "piazza_df = piazza_df.drop(\"guessed_name\", axis=1)\n", + "piazza_df = piazza_df.drop(\"guessed_email\", axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How many rows are missing data now?\n", + "len(piazza_df) - len(piazza_df.dropna()) # total rows minus complete rows = rows that still have a missing value" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Give a name of \"anonymous\" and email of \"anonymous@wisc.edu\"\n", + "# to anyone left with missing data.\n", + "piazza_df[\"name\"] = piazza_df[\"name\"].fillna(\"anonymous\")\n", + "piazza_df[\"email\"] = piazza_df[\"email\"].fillna(\"anonymous@wisc.edu\")\n", + "len(piazza_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `Pandas.DataFrame.groupby(...)`\n", + "\n", + "Syntax: `DataFrame.groupby(<COLUMN>)`\n", + "- Returns a `groupby` object\n", + "- Need to apply aggregation functions to use the return value of `groupby`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# What does this return?\n", + "piazza_df.groupby(\"role\") # a groupby object!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Try getting the \"mean\" of this groupby object (numeric columns only; name / email are text).\n", + "piazza_df.groupby(\"role\").mean(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How many answers does the average instructor, student, and TA give?\n", + "piazza_df[[\"role\", \"answers\"]].groupby(\"role\").mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How would we write this in SQL?\n", + "qry(\"\"\"\n", + "SELECT role, AVG(answers)\n", + "FROM piazza\n", + "GROUP BY role\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# What is the total number of days spent online for instructors, students, and TAs?\n", + "# Order your answer from lowest to highest\n", + "piazza_df[[\"role\", \"days_online\"]].groupby(\"role\").sum().sort_values(\"days_online\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How would we write this in SQL?\n", + "qry(\"\"\"\n", + "SELECT role, SUM(days_online) as TotalDaysOnline\n", + "FROM piazza\n", + "GROUP BY role\n", + "ORDER BY TotalDaysOnline\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Of those individuals who spend less than 100 days online,\n", + "# how does their average number of posts compare to those that\n", + "# spend 100 days or more online? Do your analysis by role as well.\n", + "\n", + "less_than_100 = piazza_df[piazza_df[\"days_online\"] < 100]\n", + "more_than_100 = piazza_df[piazza_df[\"days_online\"] >= 100]\n", + "\n", + "# In general, they post less...\n", + "print(more_than_100[\"posts\"].mean(), less_than_100[\"posts\"].mean())\n", + "print()\n", + "\n", + "# ... and this is also generally true.\n", + "print(more_than_100[[\"role\", \"posts\"]].groupby(\"role\").mean())\n", + "print(less_than_100[[\"role\", \"posts\"]].groupby(\"role\").mean())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How would we write this in SQL?\n", + "qry(\"\"\"\n", + "SELECT role, AVG(posts) as AvgPosts\n", + "FROM piazza\n", + "WHERE days_online < 100\n", + "GROUP BY role\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "qry(\"\"\"\n", + "SELECT role, AVG(posts) as AvgPosts\n", + "FROM piazza\n", + "WHERE days_online >= 100\n", + "GROUP BY role\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# What percentage of instructors, students, and TAs did not write a single answer,\n", + "# followup, or reply to a followup?\n", + "no_answers = piazza_df[(piazza_df[\"answers\"] == 0) & (piazza_df[\"followups\"] == 0) & 
(piazza_df[\"replies_to_followups\"] == 0)]\n", + "no_answers[\"role\"].value_counts() / piazza_df[\"role\"].value_counts() * 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How would we write this in SQL?\n", + "# The best we can write (without knowing subqueries) is how many!\n", + "qry(\"\"\"\n", + "SELECT role, COUNT(*)\n", + "FROM piazza\n", + "WHERE answers = 0 AND followups = 0 AND replies_to_followups = 0\n", + "GROUP BY role\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ... and then compare this with the total #!\n", + "qry(\"\"\"\n", + "SELECT role, COUNT(*)\n", + "FROM piazza\n", + "GROUP BY role\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conn.close()" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/f23/Cole_Lecture_Notes/37_AdvPandas/Lec37_AdvPandas_Template_Nelson.ipynb b/f23/Cole_Lecture_Notes/37_AdvPandas/Lec37_AdvPandas_Template_Nelson.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..ad7f23ea92e17af3b4bf742e9f6d51fc4c3ba036 --- /dev/null +++ b/f23/Cole_Lecture_Notes/37_AdvPandas/Lec37_AdvPandas_Template_Nelson.ipynb @@ -0,0 +1,587 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Advanced Pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CeWtFirwteFY" + }, + 
"outputs": [], + "source": [ + "# known import statements\n", + "import pandas as pd\n", + "import sqlite3\n", + "import os\n", + "\n", + "# new import statement\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the Piazza data from 'piazza.db'\n", + "\n", + "db_name = \"piazza.db\"\n", + "assert os.path.exists(db_name)\n", + "conn = sqlite3.connect(db_name)\n", + "\n", + "def qry(sql):\n", + " return pd.read_sql(sql, conn)\n", + "\n", + "df = qry(\"\"\"\n", + " SELECT *\n", + " FROM sqlite_master\n", + " WHERE type='table'\n", + "\"\"\")\n", + "print(df.iloc[0]['sql'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "piazza_df = pd.read_sql(\"\"\"\n", + " SELECT *\n", + " FROM piazza\n", + "\"\"\", conn)\n", + "piazza_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 1: Set the student id column as the index\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 2a: Which 10 students post the most?\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 2b: Can you plot their number of posts as a bar graph? Be sure to label your axes!\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 2c: How about with their name rather than their student id?\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# Warmup 3a: Which people had more than 10 answers? 
Include all roles.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 3b: Plot this as a bar graph.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 3c: Plot the contributions as a bar graph.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 3d: Can you get this same data using SQL?\n", + "qry(\"\"\"\n", + "\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 3e: What about their average # of days online as well?\n", + "qry(\"\"\"\n", + "\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Warmup 3f: Can we do that in Pandas as well?\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yoLGptrqhbBo" + }, + "source": [ + "# Today's Learning Objectives: \n", + "\n", + "* Setting column as index for pandas `DataFrame`\n", + "* Identify, drop, or fill missing values (`np.NaN`) using Pandas `isna`, `dropna`, and `fillna`\n", + "* Applying transformations to `DataFrame`:\n", + " * Use `apply` on pandas `Series` to apply a transformation function\n", + " * Use `replace` to replace all target values in Pandas `Series` and `DataFrame` rows / columns\n", + "* Filter, aggregate, group, and summarize information in a `DataFrame` with `groupby`\n", + "* Convert .groupby examples to SQL\n", + "* Solving the same question using SQL and pandas `DataFrame` manipulations:\n", + " * filtering, grouping, and aggregation / summarization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by name... 
What do we notice?\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Not a Number\n", + "\n", + "- `np.nan` is the floating point representation of Not a Number (the older `np.NaN` spelling was removed in NumPy 2.0)\n", + "- You do not need to know / learn the details about the `numpy` package \n", + "\n", + "### Replacing / modifying values within the `DataFrame`\n", + "\n", + "Syntax: `df.replace(<TARGET>, <REPLACE>)`\n", + "\n", + "Let's now replace the missing values (empty strings) with `np.nan`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's replace these empty strings with a special value.\n", + "piazza_df = ???\n", + "piazza_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by name again... What do we notice?\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Checking for missing values\n", + "\n", + "Syntax: `Series.isna()`\n", + "- Returns a boolean Series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run isna() on the name column\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How many people are missing a name?\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How many people are missing an email?\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How many people are missing both a name and email?\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How many people are missing either a name or email?\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# So... What do we do?\n", + "# 1. 
Drop those rows\n", + "# 2. Interpolate / Best Guess" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Option 1: Drop those rows.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Option 2a: Interpolate / Best Guess\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a function to take an email (e.g. \"calm_star@wisc.edu\")\n", + "# and return the name (e.g. \"calm star\")\n", + "def parse_name_from_email(email):\n", + " if pd.isna(email):\n", + " return np.nan\n", + " else:\n", + " pass # TODO Parse out the name!\n", + "\n", + "# Test your function!\n", + "parse_name_from_email(\"calm_star@wisc.edu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Review: `Pandas.Series.apply(...)`\n", + "Syntax: `Series.apply(<FUNCTION OBJECT REFERENCE>)`\n", + "- applies input function to every element of the Series.\n", + "- Returns a new `Series`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Now, apply that function to each value in email!\n", + "piazza_df[\"guessed_name\"] = ???\n", + "piazza_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a function to take a name (e.g. \"calm star\")\n", + "# and return the email (e.g. 
\"calm_star@wisc.edu\")\n", + "def parse_email_from_name(name):\n", + " pass\n", + "\n", + "# Test your function!\n", + "parse_email_from_name(\"calm star\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Now, apply that function to each value in name!\n", + "piazza_df[\"guessed_email\"] = ???\n", + "piazza_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `Pandas.DataFrame.apply(...)`\n", + "Syntax: `DataFrame.apply(<FUNCTION OBJECT REFERENCE>, axis=1)`\n", + "- `axis=1` means apply to each row.\n", + "- returns a new `Series`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If the name has a value, use it, otherwise use our best guess!\n", + "piazza_df[\"name\"] = piazza_df.apply(lambda r : r[\"guessed_name\"] if pd.isna(r[\"name\"]) else r[\"name\"], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Same thing for email!\n", + "piazza_df[\"email\"] = piazza_df.apply(lambda r : r[\"guessed_email\"] if pd.isna(r[\"email\"]) else r[\"email\"], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop the guessing columns\n", + "piazza_df = piazza_df.drop(\"guessed_name\", axis=1)\n", + "piazza_df = piazza_df.drop(\"guessed_email\", axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How many rows are missing data now?\n", + "len(piazza_df) - len(piazza_df.dropna())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Give a name of \"anonymous\" and email of \"anonymous@wisc.edu\"\n", + "# to anyone left with missing data.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 
`Pandas.DataFrame.groupby(...)`\n", + "\n", + "Syntax: `DataFrame.groupby(<COLUMN>)`\n", + "- Returns a `groupby` object\n", + "- Need to apply aggregation functions to use the return value of `groupby`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# What does this return?\n", + "piazza_df.groupby(\"role\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Try getting the \"mean\" of this groupby object.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How many answers does the average instructor, student, and TA give?\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How would we write this in SQL?\n", + "qry(\"\"\"\n", + "\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# What is the total number of days spent online for instructors, students, and TAs?\n", + "# Order your answer from lowest to highest\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How would we write this in SQL?\n", + "qry(\"\"\"\n", + "\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Of those individuals who spend less than 100 days online,\n", + "# how does their average number of posts compare to those that\n", + "# spend 100 days or more online? 
Do your analysis by role as well.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How would we write this in SQL?\n", + "qry(\"\"\"\n", + "\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# What percentage of instructors, students, and TAs did not write a single answer,\n", + "# followup, or reply to a followup?\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How would we write this in SQL?\n", + "qry(\"\"\"\n", + "\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conn.close()" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/f23/Cole_Lecture_Notes/37_AdvPandas/piazza.db b/f23/Cole_Lecture_Notes/37_AdvPandas/piazza.db new file mode 100644 index 0000000000000000000000000000000000000000..fd42fc45f64189a8a8f6b8013651198c97100fff Binary files /dev/null and b/f23/Cole_Lecture_Notes/37_AdvPandas/piazza.db differ