diff --git a/sum23/lecture_materials/17_Pandas/lec_17_pandas2_dataframe_template.ipynb b/sum23/lecture_materials/17_Pandas/lec_17_pandas2_dataframe_template.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..b775fd1c48bd7b4c55d93339a8a3944dae31e62d --- /dev/null +++ b/sum23/lecture_materials/17_Pandas/lec_17_pandas2_dataframe_template.ipynb @@ -0,0 +1,1208 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pandas import Series, DataFrame\n", + "# We can explicitly import Series and DataFrame, why might we do this?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Series Review\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Series from `list`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scores_list = [54, 22, 19, 73, 80]\n", + "scores_series = Series(scores_list)\n", + "scores_series\n", + "\n", + "# What is the terminology for: 0, 1, 2, ... ?? A: \n", + "# What is the terminology for: 54, 22, 19, .... ?? A: " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Selecting certain scores.\n", + "What are all the scores `> 50`?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Answer:** Boolean indexing. Try the following..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scores_series[[True, True, False, False, True]] # often called a \"mask\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are really writing a \"mask\" for our data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Series from `dict`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imagine we hire students and track their weekly hours\n", + "week1 = Series({\"Rita\":5, \"Therese\":3, \"Janice\": 6})\n", + "week2 = Series({\"Rita\":3, \"Therese\":7, \"Janice\": 4})\n", + "week3 = Series({\"Therese\":5, \"Janice\":5, \"Rita\": 8}) # Wrong order! Will this matter?\n", + "print(week1)\n", + "print(week2)\n", + "print(week3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### For everyone in Week 1, add 3 to their hours " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "week1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Total up everyone's hours" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "total_hours = ???\n", + "total_hours" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### What is week1 / week3 ?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "???\n", + "# Notice that we didn't have to worry about the order of indices" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### What type of values are stored in week1 > week2?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(week1)\n", + "print(week2)\n", + "???\n", + "# Notice that indices are ordered the same" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### What is week1 > week3?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(week1)\n", + "print(week3)\n", + "??? # Does it work?\n", + "\n", + "# How can we fix this?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# Lecture 28: Pandas 2 - DataFrames\n", + "\n", + "\n", + "Learning Objectives:\n", + "- Create a DataFrame from \n", + " - a dictionary of Series, lists, or dicts\n", + " - a list of Series, lists, dicts\n", + "- Select a column, row, cell, or rectangular region of a DataFrame\n", + "- Convert CSV files into DataFrames and DataFrames into CSV Files\n", + "- Access the head or tail of a DataFrame" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Big Idea**: Data Frames store 2-dimensional data in tables! It is a collection of Series." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## You can create a DataFrame in a variety of ways!\n", + "\n", + "- dictionary of Series\n", + "- dictionary of lists\n", + "- dictionary of dictionaries\n", + "- list of dictionaries\n", + "- list of lists\n", + "\n", + "### From a dictionary of Series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "names = Series([\"Alice\", \"Bob\", \"Cindy\", \"Dan\"])\n", + "scores = Series([6, 7, 8, 9])\n", + "\n", + "# to make a dictionary of Series, need to write column names for the keys\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### From a dictionary of lists" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "name_list = [\"Alice\", \"Bob\", \"Cindy\", \"Dan\"]\n", + "score_list = [6, 7, 8, 9]\n", + "\n", + "# this is the same as above, reminding us that Series act like lists\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### From a dictionary 
of dictionaries\n", + "We need to make up keys to match the things in each column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = {\n", + " \"Player name\": {0: \"Alice\", 1: \"Bob\", 2: \"Cindy\", 3: \"Dan\"},\n", + " \"Score\": {0: 6, 1: 7, 2: 8, 3: 9}\n", + "}\n", + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### From a list of dicts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = [\n", + " {\"Player name\": \"Alice\", \"Score\": 6},\n", + " {\"Player name\": \"Bob\", \"Score\": 7},\n", + " {\"Player name\": \"Cindy\", \"Score\": 8},\n", + " {\"Player name\": \"Dan\", \"Score\": 9}\n", + "]\n", + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### From a list of lists" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = [\n", + " [\"Alice\", 6],\n", + " [\"Bob\", 7],\n", + " [\"Cindy\", 8],\n", + " [\"Dan\", 9]\n", + "]\n", + "data\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Explicitly naming the columns\n", + "We have to add the column names, we do this with `columns = [name1, name2, ....]`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = [\n", + " [\"Alice\", 6],\n", + " [\"Bob\", 7],\n", + " [\"Cindy\", 8],\n", + " [\"Dan\", 9]\n", + "]\n", + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Explicitly naming the indices\n", + "We can use `index = [name1, name2, ...]` to rename the index of each row" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = [\n", + " {\"Player name\": \"Alice\", \"Score\": 6},\n", + " {\"Player name\": \"Bob\", \"Score\": 7},\n", + 
" {\"Player name\": \"Cindy\", \"Score\": 8},\n", + " {\"Player name\": \"Dan\", \"Score\": 9}\n", + "]\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: \n", + "# Make a DataFrame of 4 people you know with different ages\n", + "# Give names to both the columns and rows\n", + "\n", + "# Share how you did this with your neighbor\n", + "# If you both did it the same way, try it a different way." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Select a column, row, cell, or rectangular region of a DataFrame\n", + "### Data lookup: Series\n", + "- `s.loc[X]` <- lookup by pandas index\n", + "- `s.iloc[X]` <- lookup by integer position" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hours = Series({\"Alice\": 6, \"Bob\": 7, \"Cindy\": 8, \"Dan\": 9})\n", + "hours" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lookup Bob's hours by pandas index.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lookup Bob's hours by integer position.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lookup Cindy's hours by pandas index.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data lookup: DataFrame\n", + "\n", + "\n", + "- `d.loc[r]` lookup ROW by pandas ROW index\n", + "- `d.iloc[r]` lookup ROW by ROW integer position\n", + "- `d[c]` lookup COL by pandas COL index\n", + "- `d.loc[r, c]` lookup by pandas ROW index and pandas COL index\n", + "- `d.iloc[r, c]` lookup by ROW integer position and COL integer position" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We often call the object 
that we make df\n", + "data = [\n", + " [\"Hope\", 10],\n", + " [\"Peace\", 7],\n", + " [\"Joy\", 4],\n", + " [\"Love\", 11]\n", + "]\n", + "df = DataFrame(data, index = [\"H\", \"P\", \"J\", \"L\"], columns = [\"Player name\", \"Score\"])\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What are 3 different ways of accessing row L? " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How about accessing a column?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What are 3 different ways to access a single cell?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How to set values for a specific entry?\n", + "\n", + "- `d.loc[r, c] = new_val`\n", + "- `d.iloc[r, c] = new_val`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#change player L's name\n", + "df.loc[\"L\", \"Player name\"] = \"Luisa\"\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# then add 3 to that player's score using .loc\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add 7 to a different player's score using .iloc\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find the 
max score and the mean score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# find the max and mean of the \"Score\" column\n", + "print(df[\"Score\"].max(), df[\"Score\"].mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find the highest scoring player" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Slicing a DataFrame\n", + "\n", + "- `df.iloc[ROW_SLICE, COL_SLICE]` <- make a rectangular slice from the DataFrame using integer positions\n", + "- `df.loc[ROW_SLICE, COL_SLICE]` <- make a rectangular slice from the DataFrame using index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.iloc[1:3, 0:2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[\"P\":\"J\", \"Player name\":\"Score\"] # notice that this way is inclusive of endpoints" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set values for sliced DataFrame\n", + "\n", + "- `d.loc[ROW_SLICE, COL_SLICE] = new_val` <- set value by ROW INDEX and COL INDEX\n", + "- `d.iloc[ROW_SLICE, COL_SLICE] = new_val` <- set value by ROW Integer position and COL Integer position" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[\"P\":\"J\", \"Score\"] += 5\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Pandas allows slicing of non-contiguous columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# just get Player 
name for Index P and L\n", + "df.loc[[\"P\", \"L\"],\"Player name\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add 2 to the people in rows P and L\n", + "df.loc[[\"P\", \"L\"],\"Score\"] += 2\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Boolean indexing on a DataFrame\n", + "\n", + "- `d[BOOL SERIES]` <- makes a new DF of all rows where the Boolean Series is True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Make a Series of Booleans based on Score >= 15" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "b" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### use b to slice the DataFrame\n", + "if b is true, include this row in the new df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### do the last two things in a single step" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating DataFrame from csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# it's that easy! 
\n", + "df = pd.read_csv(\"IMDB-Movie-Data.csv\")\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### View the first few lines of the DataFrame\n", + "- `.head(n)` gets the first n lines, 5 is the default" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### get the first 2 rows" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### View the last few lines of the DataFrame\n", + "- `.tail(n)` gets the last n lines, 5 is the default" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What are the first and last years in our dataset?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract Year column\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"First year: {}, Last year: {}\".format(???))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What are the rows that correspond to movies whose title contains \"Harry\" ? \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What is the movie at index 6 ? 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Notice that there are two index columns\n", + "- That happened because when you write a csv from pandas to a file, it writes a new index column\n", + "- So if the dataFrame already contains an index, you are going to get two index columns\n", + "- Let's fix that problem" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How can you use slicing to get just columns with Title and Year?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df2 = ???\n", + "df2\n", + "# notice that this does not have the 'index' column" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How can you use slicing to get rid of the first column?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.iloc[???] #all the rows, not column 0\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write a df to a csv file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv(\"better_movies.csv\", index = False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Practice on your own.....Data Analysis with Data Frames\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What are all the movies that have above average run time (long movies)? 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "long_movies = ???\n", + "long_movies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Which long movie has the lowest rating?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# of these movies, what was the min rating? \n", + "min_rating = ???\n", + "min_rating" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Which movies had this min rating?\n", + "???" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What are all long movies with someone in the cast named \"Emma\" ? " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "???" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What is the title of the shortest movie?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "???" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What movie had the highest revenue?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"Revnue\"].max() # does not work, Why?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We need to clean our data\n", + "# Some movies have M at the end and others don't.\n", + "# All revenues are in millions of dollars.\n", + "def format_revenue(revenue):\n", + " \"\"\" \n", + " Checks the last character of the string and formats accordingly\n", + " \"\"\"\n", + " if type(revenue) == float: # need this in here if we run code multiple times\n", + " return revenue\n", + " elif revenue[-1] == 'M': # some have an \"M\" at the end\n", + " return ??? # TODO: convert relevant part of the string to float and multiply by 1e6\n", + " else:\n", + " return ??? # TODO: convert to float and multiply by 1e6" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# What movie had the highest revenue?\n", + "revenue = df[\"Revenue\"].apply(format_revenue) # apply a function to a column; returns a Series\n", + "print(revenue.head())\n", + "max_revenue = revenue.max()\n", + "\n", + "# make a copy of our df\n", + "rev_df = df.copy()\n", + "rev_df[\"Revenue (float)\"] = revenue\n", + "rev_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Now we can answer the question!\n", + "???" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Or more generally...\n", + "rev_df.sort_values(by = \"Revenue (float)\", ascending = False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What is the average runtime for movies by \"Francis Lawrence\"?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### More complicated questions..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Which director had the highest average rating? \n", + "\n", + "# one way is to make a python dict of director, list of ratings\n", + "director_dict = dict()\n", + "\n", + "# make the dictionary: key is director, value is list of ratings\n", + "for i in range(len(df)):\n", + " director = df.loc[i, \"Director\"]\n", + " rating = df.loc[i, \"Rating\"]\n", + " #print(i, director, rating)\n", + " if director not in director_dict:\n", + " director_dict[director] = []\n", + " director_dict[director].append(rating)\n", + "\n", + "# make a ratings dict: key is director, value is average\n", + "# only include directors with > 4 movies\n", + "ratings_dict = {k:sum(v)/len(v) for (k,v) in director_dict.items() if len(v) > 4}\n", + "\n", + "#sort a dict by values\n", + "dict(sorted(ratings_dict.items(), key=lambda t:t[-1], reverse=True))\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# FOR DEMONSTRATION PURPOSES ONLY\n", + "# We haven't (and will not) learn about \"groupby\"\n", + "# Pandas has many operations which will be helpful!\n", + "\n", + "# Consider what you already know, and what Pandas can solve\n", + "# when formulating your solutions.\n", + "rating_groups = df.groupby(\"Director\")[\"Rating\"]\n", + "rating_groups.mean()[rating_groups.count() > 4].sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extra Practice: Make up some of your own questions about the movies" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": 
"python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}