diff --git a/sum23/lecture_materials/17_Pandas/lec_17_pandas2_dataframe_notes.ipynb b/sum23/lecture_materials/17_Pandas/lec_17_pandas2_dataframe_notes.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..3eec221ae119414536d23cdd96035ae63b9c13f0 --- /dev/null +++ b/sum23/lecture_materials/17_Pandas/lec_17_pandas2_dataframe_notes.ipynb @@ -0,0 +1,3790 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pandas import Series, DataFrame\n", + "# We can explicitly import Series and DataFrame, why might we do this?\n", + "# we're lazy! can type Series instead of pd.Series" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Series Review\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Series from `list`" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 54\n", + "1 22\n", + "2 19\n", + "3 73\n", + "4 80\n", + "dtype: int64" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scores_list = [54, 22, 19, 73, 80]\n", + "scores_series = Series(scores_list)\n", + "scores_series\n", + "\n", + "# What is the terminology for: 0, 1, 2, ... ?? A: index \n", + " # (Integer positions are the same as index when creating from a list)\n", + "# What is the terminology for: 54, 22, 19, .... ?? A: values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Selecting certain scores.\n", + "What are all the scores `> 50`?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 54\n", + "3 73\n", + "4 80\n", + "dtype: int64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scores_series[ scores_series > 50 ]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Answer:** Boolean indexing. Try the following..." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 54\n", + "1 22\n", + "4 80\n", + "dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scores_series[[True, True, False, False, True]] # often called a \"mask\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are really writing a \"mask\" for our data." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 True\n", + "1 False\n", + "2 False\n", + "3 True\n", + "4 True\n", + "dtype: bool" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mask = scores_series>50\n", + "mask" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 54\n", + "3 73\n", + "4 80\n", + "dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scores_series[ mask ]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Series from `dict`" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rita 5\n", + "Therese 3\n", + "Janice 6\n", + "dtype: int64\n", + "Rita 3\n", + "Therese 7\n", + "Janice 4\n", + 
"dtype: int64\n", + "Therese 5\n", + "Janice 5\n", + "Rita 8\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "# Imagine we hire students and track their weekly hours\n", + "week1 = Series({\"Rita\":5, \"Therese\":3, \"Janice\": 6})\n", + "week2 = Series({\"Rita\":3, \"Therese\":7, \"Janice\": 4})\n", + "week3 = Series({\"Therese\":5, \"Janice\":5, \"Rita\": 8}) # Wrong order! Will this matter?\n", + "print(week1)\n", + "print(week2)\n", + "print(week3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### For everyone in Week 1, add 3 to their hours " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Rita 8\n", + "Therese 6\n", + "Janice 9\n", + "dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "week1 = week1 + 3\n", + "week1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Total up everyone's hours" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Janice 18\n", + "Rita 19\n", + "Therese 18\n", + "dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "total_hours = week1 + week2 + week3\n", + "total_hours" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### What is week1 / week3 ?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Janice 1.8\n", + "Rita 1.0\n", + "Therese 1.2\n", + "dtype: float64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "week1 / week3\n", + "# Notice that we didn't have to worry about the order of indices" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### What type of values are stored in week1 > week2?" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rita 8\n", + "Therese 6\n", + "Janice 9\n", + "dtype: int64\n", + "Rita 3\n", + "Therese 7\n", + "Janice 4\n", + "dtype: int64\n" + ] + }, + { + "data": { + "text/plain": [ + "Rita True\n", + "Therese False\n", + "Janice True\n", + "dtype: bool" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(week1)\n", + "print(week2)\n", + "week1 > week2\n", + "# Notice that indices are ordered the same" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### What is week1 > week3?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rita 8\n", + "Therese 6\n", + "Janice 9\n", + "dtype: int64\n", + "Therese 5\n", + "Janice 5\n", + "Rita 8\n", + "dtype: int64\n" + ] + }, + { + "data": { + "text/plain": [ + "Janice True\n", + "Rita False\n", + "Therese True\n", + "dtype: bool" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(week1)\n", + "print(week3)\n", + "#week1 > week3 # Does it work?\n", + "\n", + "# How can we fix this?\n", + "week1.sort_index() > week3.sort_index()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# Lecture 28: Pandas 2 - DataFrames\n", + "\n", + "\n", + "Learning Objectives:\n", + "- Create a DataFrame from \n", + " - a dictionary of Series, lists, or dicts\n", + " - a list of Series, lists, dicts\n", + "- Select a column, row, cell, or rectangular region of a DataFrame\n", + "- Convert CSV files into DataFrames and DataFrames into CSV Files\n", + "- Access the head or tail of a DataFrame" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Big Idea**: Data Frames store 2-dimensional data in tables! It is a collection of Series." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## You can create a DataFrame in a variety of ways!\n", + "\n", + "- dictionary of Series\n", + "- dictionary of lists\n", + "- dictionary of dictionaries\n", + "- list of dictionaries\n", + "- list of lists\n", + "\n", + "### From a dictionary of Series" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player Name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Alice</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Bob</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Cindy</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Dan</td>\n", + " <td>9</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player Name Score\n", + "0 Alice 6\n", + "1 Bob 7\n", + "2 Cindy 8\n", + "3 Dan 9" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "names = Series([\"Alice\", \"Bob\", \"Cindy\", \"Dan\"])\n", + "scores = Series([6, 7, 8, 9])\n", + "\n", + "dict_of_series = {\"Player Name\": names, \"Score\" : scores}\n", + "\n", + "df = DataFrame(dict_of_series)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### From a dictionary of lists" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": 
{}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player Name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Alice</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Bob</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Cindy</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Dan</td>\n", + " <td>9</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player Name Score\n", + "0 Alice 6\n", + "1 Bob 7\n", + "2 Cindy 8\n", + "3 Dan 9" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "name_list = [\"Alice\", \"Bob\", \"Cindy\", \"Dan\"]\n", + "score_list = [6, 7, 8, 9]\n", + "\n", + "# this is the same as above, reminding us that Series act like lists\n", + "dict_of_lists = { \"Player Name\": name_list, \"Score\": score_list }\n", + "\n", + "df = DataFrame(\n", + " dict_of_lists\n", + ")\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### From a dictionary of dictionaries\n", + "We need to make up keys to match the things in each column" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: 
top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Alice</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Bob</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Cindy</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Dan</td>\n", + " <td>9</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "0 Alice 6\n", + "1 Bob 7\n", + "2 Cindy 8\n", + "3 Dan 9" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = {\n", + " \"Player name\": {0: \"Alice\", 1: \"Bob\", 2: \"Cindy\", 3: \"Dan\"},\n", + " \"Score\": {0: 6, 1: 7, 2: 8, 3: 9}\n", + "}\n", + "data # data is a dictionary of dictionaries\n", + "\n", + "df = DataFrame(data)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### From a list of dicts" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Alice</td>\n", + " 
<td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Bob</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Cindy</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Dan</td>\n", + " <td>9</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "0 Alice 6\n", + "1 Bob 7\n", + "2 Cindy 8\n", + "3 Dan 9" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = [\n", + " {\"Player name\": \"Alice\", \"Score\": 6},\n", + " {\"Player name\": \"Bob\", \"Score\": 7},\n", + " {\"Player name\": \"Cindy\", \"Score\": 8},\n", + " {\"Player name\": \"Dan\", \"Score\": 9}\n", + "]\n", + "data # data is a list of dictionaries\n", + "\n", + "df = DataFrame(data)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### From a list of lists\n", + "What are the column names in this version vs previous cells?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>0</th>\n", + " <th>1</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Alice</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Bob</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Cindy</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Dan</td>\n", + " <td>9</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " 0 1\n", + "0 Alice 6\n", + "1 Bob 7\n", + "2 Cindy 8\n", + "3 Dan 9" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = [\n", + " [\"Alice\", 6],\n", + " [\"Bob\", 7],\n", + " [\"Cindy\", 8],\n", + " [\"Dan\", 9]\n", + "]\n", + "data # data is a list of lists\n", + "\n", + "df = DataFrame(data)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Explicitly naming the columns\n", + "We have to add the column names, we do this with `columns = [name1, name2, ....]`" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " 
text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Alice</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Bob</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Cindy</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Dan</td>\n", + " <td>9</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "0 Alice 6\n", + "1 Bob 7\n", + "2 Cindy 8\n", + "3 Dan 9" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = [\n", + " [\"Alice\", 6],\n", + " [\"Bob\", 7],\n", + " [\"Cindy\", 8],\n", + " [\"Dan\", 9]\n", + "]\n", + "data\n", + "\n", + "df = DataFrame(data, columns=[\"Player name\", \"Score\"])\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Explicitly naming the indices\n", + "We can use `index = [name1, name2, ...]` to rename the index of each row" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>Alice</th>\n", + " <td>Alice</td>\n", + " 
<td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Bob</th>\n", + " <td>Bob</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Cindy</th>\n", + " <td>Cindy</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Dan</th>\n", + " <td>Dan</td>\n", + " <td>9</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "Alice Alice 6\n", + "Bob Bob 7\n", + "Cindy Cindy 8\n", + "Dan Dan 9" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = [\n", + " {\"Player name\": \"Alice\", \"Score\": 6},\n", + " {\"Player name\": \"Bob\", \"Score\": 7},\n", + " {\"Player name\": \"Cindy\", \"Score\": 8},\n", + " {\"Player name\": \"Dan\", \"Score\": 9}\n", + "]\n", + "data\n", + "\n", + "df = DataFrame(data, columns=[\"Player name\", \"Score\"], index = [\"Alice\", \"Bob\", \"Cindy\", \"Dan\"])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: \n", + "# Make a DataFrame of 4 people you know with different ages\n", + "# Give names to both the columns and rows\n", + "\n", + "# Share how you did this with your neighbor\n", + "# If you both did it the same way, try it a different way." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Select a column, row, cell, or rectangular region of a DataFrame\n", + "### Data lookup: Series\n", + "- `s.loc[X]` <- lookup by pandas index\n", + "- `s.iloc[X]` <- lookup by integer position" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Alice 6\n", + "Bob 7\n", + "Cindy 8\n", + "Dan 9\n", + "dtype: int64" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hours = Series({\"Alice\": 6, \"Bob\": 7, \"Cindy\": 8, \"Dan\": 9})\n", + "hours" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Lookup Bob's hours by pandas index.\n", + "hours.loc['Bob']" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Lookup Bob's hours by integer position.\n", + "hours.iloc[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Lookup Cindy's hours by pandas index.\n", + "hours.loc['Cindy']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data lookup: DataFrame\n", + "\n", + "\n", + "- `d.loc[r]` lookup ROW by pandas ROW index\n", + "- `d.iloc[r]` lookup ROW by ROW integer position\n", + "- `d[c]` lookup COL by pandas COL index\n", + "- `d.loc[r, c]` lookup by pandas ROW index and pandas COL index\n", + "- `d.iloc[r, c]` lookup by ROW integer position 
and COL integer position" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>H</th>\n", + " <td>Hope</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>P</th>\n", + " <td>Peace</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>J</th>\n", + " <td>Joy</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>L</th>\n", + " <td>Love</td>\n", + " <td>11</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "H Hope 10\n", + "P Peace 7\n", + "J Joy 4\n", + "L Love 11" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# We often call the object that we make df\n", + "data = [\n", + " [\"Hope\", 10],\n", + " [\"Peace\", 7],\n", + " [\"Joy\", 4],\n", + " [\"Love\", 11]\n", + "]\n", + "df = DataFrame(data, index = [\"H\", \"P\", \"J\", \"L\"], columns = [\"Player name\", \"Score\"])\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What are 3 different ways of accessing row L? 
" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Player name Love\n", + "Score 11\n", + "Name: L, dtype: object\n", + "\n", + "Player name Love\n", + "Score 11\n", + "Name: L, dtype: object\n", + "\n", + "Player name Love\n", + "Score 11\n", + "Name: L, dtype: object\n" + ] + } + ], + "source": [ + "print(df.loc[\"L\"])\n", + "print()\n", + "print(df.iloc[3])\n", + "print()\n", + "print(df.loc[\"L\", :])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How about accessing a column?" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "H Hope\n", + "P Peace\n", + "J Joy\n", + "L Love\n", + "Name: Player name, dtype: object\n", + "\n", + "H Hope\n", + "P Peace\n", + "J Joy\n", + "L Love\n", + "Name: Player name, dtype: object\n", + "\n", + "H Hope\n", + "P Peace\n", + "J Joy\n", + "L Love\n", + "Name: Player name, dtype: object\n" + ] + } + ], + "source": [ + "print(df['Player name'])\n", + "print()\n", + "print(df.loc[:, \"Player name\"])\n", + "print()\n", + "print(df.iloc[:, 0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What are 3 different ways to access a single cell?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Love\n", + "\n", + "Love\n", + "\n", + "Love\n" + ] + } + ], + "source": [ + "print(df.loc[\"L\", \"Player name\"])\n", + "print()\n", + "print(df.iloc[3, 0])\n", + "print()\n", + "print(df[\"Player name\"].loc[\"L\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How to set values for a specific entry?\n", + "\n", + "- `d.loc[r, c] = new_val`\n", + "- `d.iloc[r, c] = new_val`" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>H</th>\n", + " <td>Hope</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>P</th>\n", + " <td>Peace</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>J</th>\n", + " <td>Joy</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>L</th>\n", + " <td>Luisa</td>\n", + " <td>11</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "H Hope 10\n", + "P Peace 7\n", + "J Joy 4\n", + "L Luisa 11" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# change player L's name\n", + "df.loc[\"L\", 
\"Player name\"] = \"Luisa\"\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>H</th>\n", + " <td>Hope</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>P</th>\n", + " <td>Peace</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>J</th>\n", + " <td>Joy</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>L</th>\n", + " <td>Luisa</td>\n", + " <td>14</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "H Hope 10\n", + "P Peace 7\n", + "J Joy 4\n", + "L Luisa 14" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# then add 3 to that player's score using .loc\n", + "df.loc[\"L\", \"Score\"] += 3\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " 
</tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>H</th>\n", + " <td>Hope</td>\n", + " <td>17</td>\n", + " </tr>\n", + " <tr>\n", + " <th>P</th>\n", + " <td>Peace</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>J</th>\n", + " <td>Joy</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>L</th>\n", + " <td>Luisa</td>\n", + " <td>14</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "H Hope 17\n", + "P Peace 7\n", + "J Joy 4\n", + "L Luisa 14" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# add 7 to a different player's score using .iloc\n", + "df.iloc[0, 1] += 7\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find the max score and the mean score" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "17 10.5\n" + ] + } + ], + "source": [ + "# find the max and mean of the \"Score\" column\n", + "print(df[\"Score\"].max(), df[\"Score\"].mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find the highest scoring player" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Hope'" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "max_idx = df[\"Score\"].idxmax()\n", + "\n", + "df.loc[max_idx, \"Player name\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Slicing a DataFrame\n", + "\n", + "- `df.iloc[ROW_SLICE, COL_SLICE]` <- make a rectangular slice from the DataFrame using integer positions\n", + "- `df.loc[ROW_SLICE, COL_SLICE]` <- make a rectangular slice from the DataFrame using index" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>P</th>\n", + " <td>Peace</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>J</th>\n", + " <td>Joy</td>\n", + " <td>4</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "P Peace 7\n", + "J Joy 4" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.iloc[1:3, 0:2]" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>P</th>\n", + " <td>Peace</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>J</th>\n", + " <td>Joy</td>\n", + " <td>4</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "P Peace 7\n", + "J Joy 4" + ] + }, + "execution_count": 45, + "metadata": {}, 
+ "output_type": "execute_result" + } + ], + "source": [ + "df.loc[\"P\":\"J\", \"Player name\":\"Score\"] # notice that this way is inclusive of endpoints" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set values for sliced DataFrame\n", + "\n", + "- `d.loc[ROW_SLICE, COL_SLICE] = new_val` <- set value by ROW INDEX and COL INDEX\n", + "- `d.iloc[ROW_SLICE, COL_SLICE] = new_val` <- set value by ROW Integer position and COL Integer position" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>H</th>\n", + " <td>Hope</td>\n", + " <td>17</td>\n", + " </tr>\n", + " <tr>\n", + " <th>P</th>\n", + " <td>Peace</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>J</th>\n", + " <td>Joy</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>L</th>\n", + " <td>Luisa</td>\n", + " <td>14</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "H Hope 17\n", + "P Peace 7\n", + "J Joy 4\n", + "L Luisa 14" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + 
"\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>H</th>\n", + " <td>Hope</td>\n", + " <td>17</td>\n", + " </tr>\n", + " <tr>\n", + " <th>P</th>\n", + " <td>Peace</td>\n", + " <td>12</td>\n", + " </tr>\n", + " <tr>\n", + " <th>J</th>\n", + " <td>Joy</td>\n", + " <td>9</td>\n", + " </tr>\n", + " <tr>\n", + " <th>L</th>\n", + " <td>Luisa</td>\n", + " <td>14</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "H Hope 17\n", + "P Peace 12\n", + "J Joy 9\n", + "L Luisa 14" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[\"P\":\"J\", \"Score\"] += 5\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Pandas allows selecting non-contiguous rows with a list of index labels" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "P Peace\n", + "L Luisa\n", + "Name: Player name, dtype: object" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# just get Player name for Index P and L\n", + "df.loc[[\"P\", \"L\"],\"Player name\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", +
"</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>H</th>\n", + " <td>Hope</td>\n", + " <td>17</td>\n", + " </tr>\n", + " <tr>\n", + " <th>P</th>\n", + " <td>Peace</td>\n", + " <td>14</td>\n", + " </tr>\n", + " <tr>\n", + " <th>J</th>\n", + " <td>Joy</td>\n", + " <td>9</td>\n", + " </tr>\n", + " <tr>\n", + " <th>L</th>\n", + " <td>Luisa</td>\n", + " <td>16</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "H Hope 17\n", + "P Peace 14\n", + "J Joy 9\n", + "L Luisa 16" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# add 2 to the people in rows P and L\n", + "df.loc[[\"P\", \"L\"],\"Score\"] += 2\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Boolean indexing on a DataFrame\n", + "\n", + "- `d[BOOL SERIES]` <- makes a new DF containing only the rows where the Boolean Series is True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Make a Series of Booleans based on Score >= 15" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "H True\n", + "P False\n", + "J False\n", + "L True\n", + "Name: Score, dtype: bool" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "b = df[\"Score\"] >= 15\n", + "b" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### use b to slice the DataFrame\n", + "if b is true, include this row in the new df" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {},
+ "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>H</th>\n", + " <td>Hope</td>\n", + " <td>17</td>\n", + " </tr>\n", + " <tr>\n", + " <th>L</th>\n", + " <td>Luisa</td>\n", + " <td>16</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "H Hope 17\n", + "L Luisa 16" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_df = df[b]\n", + "new_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### do the last two things in a single step" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>H</th>\n", + " <td>Hope</td>\n", + " <td>17</td>\n", + " </tr>\n", + " <tr>\n", + " <th>L</th>\n", + " <td>Luisa</td>\n", + " <td>16</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], 
+ "text/plain": [ + " Player name Score\n", + "H Hope 17\n", + "L Luisa 16" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_df = df [df[\"Score\"] >= 15]\n", + "new_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating DataFrame from csv" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Index</th>\n", + " <th>Title</th>\n", + " <th>Genre</th>\n", + " <th>Director</th>\n", + " <th>Cast</th>\n", + " <th>Year</th>\n", + " <th>Runtime</th>\n", + " <th>Rating</th>\n", + " <th>Revenue</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0</td>\n", + " <td>Guardians of the Galaxy</td>\n", + " <td>Action,Adventure,Sci-Fi</td>\n", + " <td>James Gunn</td>\n", + " <td>Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...</td>\n", + " <td>2014</td>\n", + " <td>121</td>\n", + " <td>8.1</td>\n", + " <td>333.13</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>Prometheus</td>\n", + " <td>Adventure,Mystery,Sci-Fi</td>\n", + " <td>Ridley Scott</td>\n", + " <td>Noomi Rapace, Logan Marshall-Green, Michael ...</td>\n", + " <td>2012</td>\n", + " <td>124</td>\n", + " <td>7.0</td>\n", + " <td>126.46M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2</td>\n", + " <td>Split</td>\n", + " <td>Horror,Thriller</td>\n", + " <td>M. 
Night Shyamalan</td>\n", + " <td>James McAvoy, Anya Taylor-Joy, Haley Lu Richar...</td>\n", + " <td>2016</td>\n", + " <td>117</td>\n", + " <td>7.3</td>\n", + " <td>138.12M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>3</td>\n", + " <td>Sing</td>\n", + " <td>Animation,Comedy,Family</td>\n", + " <td>Christophe Lourdelet</td>\n", + " <td>Matthew McConaughey,Reese Witherspoon, Seth Ma...</td>\n", + " <td>2016</td>\n", + " <td>108</td>\n", + " <td>7.2</td>\n", + " <td>270.32</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>4</td>\n", + " <td>Suicide Squad</td>\n", + " <td>Action,Adventure,Fantasy</td>\n", + " <td>David Ayer</td>\n", + " <td>Will Smith, Jared Leto, Margot Robbie, Viola D...</td>\n", + " <td>2016</td>\n", + " <td>123</td>\n", + " <td>6.2</td>\n", + " <td>325.02</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1063</th>\n", + " <td>1063</td>\n", + " <td>Guardians of the Galaxy Vol. 
2</td>\n", + " <td>Action, Adventure, Comedy</td>\n", + " <td>James Gunn</td>\n", + " <td>Chris Pratt, Zoe Saldana, Dave Bautista, Vin D...</td>\n", + " <td>2017</td>\n", + " <td>136</td>\n", + " <td>7.6</td>\n", + " <td>389.81</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1064</th>\n", + " <td>1064</td>\n", + " <td>Baby Driver</td>\n", + " <td>Action, Crime, Drama</td>\n", + " <td>Edgar Wright</td>\n", + " <td>Ansel Elgort, Jon Bernthal, Jon Hamm, Eiza Gon...</td>\n", + " <td>2017</td>\n", + " <td>113</td>\n", + " <td>7.6</td>\n", + " <td>107.83</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1065</th>\n", + " <td>1065</td>\n", + " <td>Only the Brave</td>\n", + " <td>Action, Biography, Drama</td>\n", + " <td>Joseph Kosinski</td>\n", + " <td>Josh Brolin, Miles Teller, Jeff Bridges, Jenni...</td>\n", + " <td>2017</td>\n", + " <td>134</td>\n", + " <td>7.6</td>\n", + " <td>18.34</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1066</th>\n", + " <td>1066</td>\n", + " <td>Incredibles 2</td>\n", + " <td>Animation, Action, Adventure</td>\n", + " <td>Brad Bird</td>\n", + " <td>Craig T. Nelson, Holly Hunter, Sarah Vowell, H...</td>\n", + " <td>2018</td>\n", + " <td>118</td>\n", + " <td>7.6</td>\n", + " <td>608.58</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1067</th>\n", + " <td>1067</td>\n", + " <td>A Star Is Born</td>\n", + " <td>Drama, Music, Romance</td>\n", + " <td>Bradley Cooper</td>\n", + " <td>Lady Gaga, Bradley Cooper, Sam Elliott, Greg G...</td>\n", + " <td>2018</td>\n", + " <td>136</td>\n", + " <td>7.6</td>\n", + " <td>215.29</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>1068 rows × 9 columns</p>\n", + "</div>" + ], + "text/plain": [ + " Index Title Genre \\\n", + "0 0 Guardians of the Galaxy Action,Adventure,Sci-Fi \n", + "1 1 Prometheus Adventure,Mystery,Sci-Fi \n", + "2 2 Split Horror,Thriller \n", + "3 3 Sing Animation,Comedy,Family \n", + "4 4 Suicide Squad Action,Adventure,Fantasy \n", + "... ... ... ... \n", + "1063 1063 Guardians of the Galaxy Vol. 
2 Action, Adventure, Comedy \n", + "1064 1064 Baby Driver Action, Crime, Drama \n", + "1065 1065 Only the Brave Action, Biography, Drama \n", + "1066 1066 Incredibles 2 Animation, Action, Adventure \n", + "1067 1067 A Star Is Born Drama, Music, Romance \n", + "\n", + " Director Cast \\\n", + "0 James Gunn Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S... \n", + "1 Ridley Scott Noomi Rapace, Logan Marshall-Green, Michael ... \n", + "2 M. Night Shyamalan James McAvoy, Anya Taylor-Joy, Haley Lu Richar... \n", + "3 Christophe Lourdelet Matthew McConaughey,Reese Witherspoon, Seth Ma... \n", + "4 David Ayer Will Smith, Jared Leto, Margot Robbie, Viola D... \n", + "... ... ... \n", + "1063 James Gunn Chris Pratt, Zoe Saldana, Dave Bautista, Vin D... \n", + "1064 Edgar Wright Ansel Elgort, Jon Bernthal, Jon Hamm, Eiza Gon... \n", + "1065 Joseph Kosinski Josh Brolin, Miles Teller, Jeff Bridges, Jenni... \n", + "1066 Brad Bird Craig T. Nelson, Holly Hunter, Sarah Vowell, H... \n", + "1067 Bradley Cooper Lady Gaga, Bradley Cooper, Sam Elliott, Greg G... \n", + "\n", + " Year Runtime Rating Revenue \n", + "0 2014 121 8.1 333.13 \n", + "1 2012 124 7.0 126.46M \n", + "2 2016 117 7.3 138.12M \n", + "3 2016 108 7.2 270.32 \n", + "4 2016 123 6.2 325.02 \n", + "... ... ... ... ... \n", + "1063 2017 136 7.6 389.81 \n", + "1064 2017 113 7.6 107.83 \n", + "1065 2017 134 7.6 18.34 \n", + "1066 2018 118 7.6 608.58 \n", + "1067 2018 136 7.6 215.29 \n", + "\n", + "[1068 rows x 9 columns]" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# it's that easy! 
\n", + "df = pd.read_csv(\"IMDB-Movie-Data.csv\")\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### View the first few lines of the DataFrame\n", + "- `.head(n)` gets the first n lines, 5 is the default" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Index</th>\n", + " <th>Title</th>\n", + " <th>Genre</th>\n", + " <th>Director</th>\n", + " <th>Cast</th>\n", + " <th>Year</th>\n", + " <th>Runtime</th>\n", + " <th>Rating</th>\n", + " <th>Revenue</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0</td>\n", + " <td>Guardians of the Galaxy</td>\n", + " <td>Action,Adventure,Sci-Fi</td>\n", + " <td>James Gunn</td>\n", + " <td>Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...</td>\n", + " <td>2014</td>\n", + " <td>121</td>\n", + " <td>8.1</td>\n", + " <td>333.13</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>Prometheus</td>\n", + " <td>Adventure,Mystery,Sci-Fi</td>\n", + " <td>Ridley Scott</td>\n", + " <td>Noomi Rapace, Logan Marshall-Green, Michael ...</td>\n", + " <td>2012</td>\n", + " <td>124</td>\n", + " <td>7.0</td>\n", + " <td>126.46M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2</td>\n", + " <td>Split</td>\n", + " <td>Horror,Thriller</td>\n", + " <td>M. 
Night Shyamalan</td>\n", + " <td>James McAvoy, Anya Taylor-Joy, Haley Lu Richar...</td>\n", + " <td>2016</td>\n", + " <td>117</td>\n", + " <td>7.3</td>\n", + " <td>138.12M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>3</td>\n", + " <td>Sing</td>\n", + " <td>Animation,Comedy,Family</td>\n", + " <td>Christophe Lourdelet</td>\n", + " <td>Matthew McConaughey,Reese Witherspoon, Seth Ma...</td>\n", + " <td>2016</td>\n", + " <td>108</td>\n", + " <td>7.2</td>\n", + " <td>270.32</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>4</td>\n", + " <td>Suicide Squad</td>\n", + " <td>Action,Adventure,Fantasy</td>\n", + " <td>David Ayer</td>\n", + " <td>Will Smith, Jared Leto, Margot Robbie, Viola D...</td>\n", + " <td>2016</td>\n", + " <td>123</td>\n", + " <td>6.2</td>\n", + " <td>325.02</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Index Title Genre \\\n", + "0 0 Guardians of the Galaxy Action,Adventure,Sci-Fi \n", + "1 1 Prometheus Adventure,Mystery,Sci-Fi \n", + "2 2 Split Horror,Thriller \n", + "3 3 Sing Animation,Comedy,Family \n", + "4 4 Suicide Squad Action,Adventure,Fantasy \n", + "\n", + " Director Cast \\\n", + "0 James Gunn Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S... \n", + "1 Ridley Scott Noomi Rapace, Logan Marshall-Green, Michael ... \n", + "2 M. Night Shyamalan James McAvoy, Anya Taylor-Joy, Haley Lu Richar... \n", + "3 Christophe Lourdelet Matthew McConaughey,Reese Witherspoon, Seth Ma... \n", + "4 David Ayer Will Smith, Jared Leto, Margot Robbie, Viola D... 
\n", + "\n", + " Year Runtime Rating Revenue \n", + "0 2014 121 8.1 333.13 \n", + "1 2012 124 7.0 126.46M \n", + "2 2016 117 7.3 138.12M \n", + "3 2016 108 7.2 270.32 \n", + "4 2016 123 6.2 325.02 " + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### get the first 2 rows" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Index</th>\n", + " <th>Title</th>\n", + " <th>Genre</th>\n", + " <th>Director</th>\n", + " <th>Cast</th>\n", + " <th>Year</th>\n", + " <th>Runtime</th>\n", + " <th>Rating</th>\n", + " <th>Revenue</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0</td>\n", + " <td>Guardians of the Galaxy</td>\n", + " <td>Action,Adventure,Sci-Fi</td>\n", + " <td>James Gunn</td>\n", + " <td>Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...</td>\n", + " <td>2014</td>\n", + " <td>121</td>\n", + " <td>8.1</td>\n", + " <td>333.13</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>Prometheus</td>\n", + " <td>Adventure,Mystery,Sci-Fi</td>\n", + " <td>Ridley Scott</td>\n", + " <td>Noomi Rapace, Logan Marshall-Green, Michael ...</td>\n", + " <td>2012</td>\n", + " <td>124</td>\n", + " <td>7.0</td>\n", + " <td>126.46M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Index Title Genre Director \\\n", + "0 0 Guardians 
of the Galaxy Action,Adventure,Sci-Fi James Gunn \n", + "1 1 Prometheus Adventure,Mystery,Sci-Fi Ridley Scott \n", + "\n", + " Cast Year Runtime Rating \\\n", + "0 Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S... 2014 121 8.1 \n", + "1 Noomi Rapace, Logan Marshall-Green, Michael ... 2012 124 7.0 \n", + "\n", + " Revenue \n", + "0 333.13 \n", + "1 126.46M " + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### View the last few lines of the DataFrame\n", + "- `.tail(n)` gets the last n lines, 5 is the default" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Index</th>\n", + " <th>Title</th>\n", + " <th>Genre</th>\n", + " <th>Director</th>\n", + " <th>Cast</th>\n", + " <th>Year</th>\n", + " <th>Runtime</th>\n", + " <th>Rating</th>\n", + " <th>Revenue</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1063</th>\n", + " <td>1063</td>\n", + " <td>Guardians of the Galaxy Vol. 
2</td>\n", + " <td>Action, Adventure, Comedy</td>\n", + " <td>James Gunn</td>\n", + " <td>Chris Pratt, Zoe Saldana, Dave Bautista, Vin D...</td>\n", + " <td>2017</td>\n", + " <td>136</td>\n", + " <td>7.6</td>\n", + " <td>389.81</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1064</th>\n", + " <td>1064</td>\n", + " <td>Baby Driver</td>\n", + " <td>Action, Crime, Drama</td>\n", + " <td>Edgar Wright</td>\n", + " <td>Ansel Elgort, Jon Bernthal, Jon Hamm, Eiza Gon...</td>\n", + " <td>2017</td>\n", + " <td>113</td>\n", + " <td>7.6</td>\n", + " <td>107.83</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1065</th>\n", + " <td>1065</td>\n", + " <td>Only the Brave</td>\n", + " <td>Action, Biography, Drama</td>\n", + " <td>Joseph Kosinski</td>\n", + " <td>Josh Brolin, Miles Teller, Jeff Bridges, Jenni...</td>\n", + " <td>2017</td>\n", + " <td>134</td>\n", + " <td>7.6</td>\n", + " <td>18.34</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1066</th>\n", + " <td>1066</td>\n", + " <td>Incredibles 2</td>\n", + " <td>Animation, Action, Adventure</td>\n", + " <td>Brad Bird</td>\n", + " <td>Craig T. Nelson, Holly Hunter, Sarah Vowell, H...</td>\n", + " <td>2018</td>\n", + " <td>118</td>\n", + " <td>7.6</td>\n", + " <td>608.58</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1067</th>\n", + " <td>1067</td>\n", + " <td>A Star Is Born</td>\n", + " <td>Drama, Music, Romance</td>\n", + " <td>Bradley Cooper</td>\n", + " <td>Lady Gaga, Bradley Cooper, Sam Elliott, Greg G...</td>\n", + " <td>2018</td>\n", + " <td>136</td>\n", + " <td>7.6</td>\n", + " <td>215.29</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Index Title Genre \\\n", + "1063 1063 Guardians of the Galaxy Vol. 
2 Action, Adventure, Comedy \n", + "1064 1064 Baby Driver Action, Crime, Drama \n", + "1065 1065 Only the Brave Action, Biography, Drama \n", + "1066 1066 Incredibles 2 Animation, Action, Adventure \n", + "1067 1067 A Star Is Born Drama, Music, Romance \n", + "\n", + " Director Cast \\\n", + "1063 James Gunn Chris Pratt, Zoe Saldana, Dave Bautista, Vin D... \n", + "1064 Edgar Wright Ansel Elgort, Jon Bernthal, Jon Hamm, Eiza Gon... \n", + "1065 Joseph Kosinski Josh Brolin, Miles Teller, Jeff Bridges, Jenni... \n", + "1066 Brad Bird Craig T. Nelson, Holly Hunter, Sarah Vowell, H... \n", + "1067 Bradley Cooper Lady Gaga, Bradley Cooper, Sam Elliott, Greg G... \n", + "\n", + " Year Runtime Rating Revenue \n", + "1063 2017 136 7.6 389.81 \n", + "1064 2017 113 7.6 107.83 \n", + "1065 2017 134 7.6 18.34 \n", + "1066 2018 118 7.6 608.58 \n", + "1067 2018 136 7.6 215.29 " + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Index</th>\n", + " <th>Title</th>\n", + " <th>Genre</th>\n", + " <th>Director</th>\n", + " <th>Cast</th>\n", + " <th>Year</th>\n", + " <th>Runtime</th>\n", + " <th>Rating</th>\n", + " <th>Revenue</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1066</th>\n", + " <td>1066</td>\n", + " <td>Incredibles 2</td>\n", + " <td>Animation, Action, Adventure</td>\n", + " <td>Brad Bird</td>\n", + " <td>Craig T. 
Nelson, Holly Hunter, Sarah Vowell, H...</td>\n", + " <td>2018</td>\n", + " <td>118</td>\n", + " <td>7.6</td>\n", + " <td>608.58</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1067</th>\n", + " <td>1067</td>\n", + " <td>A Star Is Born</td>\n", + " <td>Drama, Music, Romance</td>\n", + " <td>Bradley Cooper</td>\n", + " <td>Lady Gaga, Bradley Cooper, Sam Elliott, Greg G...</td>\n", + " <td>2018</td>\n", + " <td>136</td>\n", + " <td>7.6</td>\n", + " <td>215.29</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Index Title Genre Director \\\n", + "1066 1066 Incredibles 2 Animation, Action, Adventure Brad Bird \n", + "1067 1067 A Star Is Born Drama, Music, Romance Bradley Cooper \n", + "\n", + " Cast Year Runtime \\\n", + "1066 Craig T. Nelson, Holly Hunter, Sarah Vowell, H... 2018 118 \n", + "1067 Lady Gaga, Bradley Cooper, Sam Elliott, Greg G... 2018 136 \n", + "\n", + " Rating Revenue \n", + "1066 7.6 608.58 \n", + "1067 7.6 215.29 " + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.tail(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What are the first and last years in our dataset?" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract Year column\n", + "years = df[\"Year\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "First year: 2006, Last year: 2020\n" + ] + } + ], + "source": [ + "print(\"First year: {}, Last year: {}\".format(years.min(), years.max()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What are the rows that correspond to movies whose title contains \"Harry\" ? 
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Index</th>\n", + " <th>Title</th>\n", + " <th>Genre</th>\n", + " <th>Director</th>\n", + " <th>Cast</th>\n", + " <th>Year</th>\n", + " <th>Runtime</th>\n", + " <th>Rating</th>\n", + " <th>Revenue</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>114</th>\n", + " <td>114</td>\n", + " <td>Harry Potter and the Deathly Hallows: Part 2</td>\n", + " <td>Adventure,Drama,Fantasy</td>\n", + " <td>David Yates</td>\n", + " <td>Daniel Radcliffe, Emma Watson, Rupert Grint, M...</td>\n", + " <td>2011</td>\n", + " <td>130</td>\n", + " <td>8.1</td>\n", + " <td>380.96</td>\n", + " </tr>\n", + " <tr>\n", + " <th>314</th>\n", + " <td>314</td>\n", + " <td>Harry Potter and the Order of the Phoenix</td>\n", + " <td>Adventure,Family,Fantasy</td>\n", + " <td>David Yates</td>\n", + " <td>Daniel Radcliffe, Emma Watson, Rupert Grint, B...</td>\n", + " <td>2007</td>\n", + " <td>138</td>\n", + " <td>7.5</td>\n", + " <td>292</td>\n", + " </tr>\n", + " <tr>\n", + " <th>417</th>\n", + " <td>417</td>\n", + " <td>Harry Potter and the Deathly Hallows: Part 1</td>\n", + " <td>Adventure,Family,Fantasy</td>\n", + " <td>David Yates</td>\n", + " <td>Daniel Radcliffe, Emma Watson, Rupert Grint, B...</td>\n", + " <td>2010</td>\n", + " <td>146</td>\n", + " <td>7.7</td>\n", + " <td>294.98</td>\n", + " </tr>\n", + " <tr>\n", + " <th>472</th>\n", + " <td>472</td>\n", + " <td>Harry Potter and the Half-Blood Prince</td>\n", + " 
<td>Adventure,Family,Fantasy</td>\n", + " <td>David Yates</td>\n", + " <td>Daniel Radcliffe, Emma Watson, Rupert Grint, M...</td>\n", + " <td>2009</td>\n", + " <td>153</td>\n", + " <td>7.5</td>\n", + " <td>301.96</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Index Title \\\n", + "114 114 Harry Potter and the Deathly Hallows: Part 2 \n", + "314 314 Harry Potter and the Order of the Phoenix \n", + "417 417 Harry Potter and the Deathly Hallows: Part 1 \n", + "472 472 Harry Potter and the Half-Blood Prince \n", + "\n", + " Genre Director \\\n", + "114 Adventure,Drama,Fantasy David Yates \n", + "314 Adventure,Family,Fantasy David Yates \n", + "417 Adventure,Family,Fantasy David Yates \n", + "472 Adventure,Family,Fantasy David Yates \n", + "\n", + " Cast Year Runtime Rating \\\n", + "114 Daniel Radcliffe, Emma Watson, Rupert Grint, M... 2011 130 8.1 \n", + "314 Daniel Radcliffe, Emma Watson, Rupert Grint, B... 2007 138 7.5 \n", + "417 Daniel Radcliffe, Emma Watson, Rupert Grint, B... 2010 146 7.7 \n", + "472 Daniel Radcliffe, Emma Watson, Rupert Grint, M... 2009 153 7.5 \n", + "\n", + " Revenue \n", + "114 380.96 \n", + "314 292 \n", + "417 294.98 \n", + "472 301.96 " + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "harry_rows = df[ df[\"Title\"].str.contains(\"Harry\")]\n", + "harry_rows" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What is the movie at index 6 ? 
" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'La La Land'" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.iloc[6].Title" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Notice that there are two index columns\n", + "- That happened because when pandas writes a DataFrame to a CSV file, it also writes the index as a new column (unless `index=False` is passed)\n", + "- So if the DataFrame already contains an index column, you are going to get two index columns\n", + "- Let's fix that problem" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How can you use slicing to get just columns with Title and Year?" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Title</th>\n", + " <th>Year</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Guardians of the Galaxy</td>\n", + " <td>2014</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Prometheus</td>\n", + " <td>2012</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Split</td>\n", + " <td>2016</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Sing</td>\n", + " <td>2016</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Suicide Squad</td>\n", + " <td>2016</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", +
" <th>1063</th>\n", + " <td>Guardians of the Galaxy Vol. 2</td>\n", + " <td>2017</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1064</th>\n", + " <td>Baby Driver</td>\n", + " <td>2017</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1065</th>\n", + " <td>Only the Brave</td>\n", + " <td>2017</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1066</th>\n", + " <td>Incredibles 2</td>\n", + " <td>2018</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1067</th>\n", + " <td>A Star Is Born</td>\n", + " <td>2018</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>1068 rows × 2 columns</p>\n", + "</div>" + ], + "text/plain": [ + " Title Year\n", + "0 Guardians of the Galaxy 2014\n", + "1 Prometheus 2012\n", + "2 Split 2016\n", + "3 Sing 2016\n", + "4 Suicide Squad 2016\n", + "... ... ...\n", + "1063 Guardians of the Galaxy Vol. 2 2017\n", + "1064 Baby Driver 2017\n", + "1065 Only the Brave 2017\n", + "1066 Incredibles 2 2018\n", + "1067 A Star Is Born 2018\n", + "\n", + "[1068 rows x 2 columns]" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2 = df[[\"Title\", \"Year\"]]\n", + "df2\n", + "# notice that this does not have the 'index' column" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How can you use slicing to get rid of the first column?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Title</th>\n", + " <th>Genre</th>\n", + " <th>Director</th>\n", + " <th>Cast</th>\n", + " <th>Year</th>\n", + " <th>Runtime</th>\n", + " <th>Rating</th>\n", + " <th>Revenue</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Guardians of the Galaxy</td>\n", + " <td>Action,Adventure,Sci-Fi</td>\n", + " <td>James Gunn</td>\n", + " <td>Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...</td>\n", + " <td>2014</td>\n", + " <td>121</td>\n", + " <td>8.1</td>\n", + " <td>333.13</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Prometheus</td>\n", + " <td>Adventure,Mystery,Sci-Fi</td>\n", + " <td>Ridley Scott</td>\n", + " <td>Noomi Rapace, Logan Marshall-Green, Michael ...</td>\n", + " <td>2012</td>\n", + " <td>124</td>\n", + " <td>7.0</td>\n", + " <td>126.46M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Split</td>\n", + " <td>Horror,Thriller</td>\n", + " <td>M. 
Night Shyamalan</td>\n", + " <td>James McAvoy, Anya Taylor-Joy, Haley Lu Richar...</td>\n", + " <td>2016</td>\n", + " <td>117</td>\n", + " <td>7.3</td>\n", + " <td>138.12M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Sing</td>\n", + " <td>Animation,Comedy,Family</td>\n", + " <td>Christophe Lourdelet</td>\n", + " <td>Matthew McConaughey,Reese Witherspoon, Seth Ma...</td>\n", + " <td>2016</td>\n", + " <td>108</td>\n", + " <td>7.2</td>\n", + " <td>270.32</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Suicide Squad</td>\n", + " <td>Action,Adventure,Fantasy</td>\n", + " <td>David Ayer</td>\n", + " <td>Will Smith, Jared Leto, Margot Robbie, Viola D...</td>\n", + " <td>2016</td>\n", + " <td>123</td>\n", + " <td>6.2</td>\n", + " <td>325.02</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1063</th>\n", + " <td>Guardians of the Galaxy Vol. 
2</td>\n", + " <td>Action, Adventure, Comedy</td>\n", + " <td>James Gunn</td>\n", + " <td>Chris Pratt, Zoe Saldana, Dave Bautista, Vin D...</td>\n", + " <td>2017</td>\n", + " <td>136</td>\n", + " <td>7.6</td>\n", + " <td>389.81</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1064</th>\n", + " <td>Baby Driver</td>\n", + " <td>Action, Crime, Drama</td>\n", + " <td>Edgar Wright</td>\n", + " <td>Ansel Elgort, Jon Bernthal, Jon Hamm, Eiza Gon...</td>\n", + " <td>2017</td>\n", + " <td>113</td>\n", + " <td>7.6</td>\n", + " <td>107.83</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1065</th>\n", + " <td>Only the Brave</td>\n", + " <td>Action, Biography, Drama</td>\n", + " <td>Joseph Kosinski</td>\n", + " <td>Josh Brolin, Miles Teller, Jeff Bridges, Jenni...</td>\n", + " <td>2017</td>\n", + " <td>134</td>\n", + " <td>7.6</td>\n", + " <td>18.34</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1066</th>\n", + " <td>Incredibles 2</td>\n", + " <td>Animation, Action, Adventure</td>\n", + " <td>Brad Bird</td>\n", + " <td>Craig T. Nelson, Holly Hunter, Sarah Vowell, H...</td>\n", + " <td>2018</td>\n", + " <td>118</td>\n", + " <td>7.6</td>\n", + " <td>608.58</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1067</th>\n", + " <td>A Star Is Born</td>\n", + " <td>Drama, Music, Romance</td>\n", + " <td>Bradley Cooper</td>\n", + " <td>Lady Gaga, Bradley Cooper, Sam Elliott, Greg G...</td>\n", + " <td>2018</td>\n", + " <td>136</td>\n", + " <td>7.6</td>\n", + " <td>215.29</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>1068 rows × 8 columns</p>\n", + "</div>" + ], + "text/plain": [ + " Title Genre \\\n", + "0 Guardians of the Galaxy Action,Adventure,Sci-Fi \n", + "1 Prometheus Adventure,Mystery,Sci-Fi \n", + "2 Split Horror,Thriller \n", + "3 Sing Animation,Comedy,Family \n", + "4 Suicide Squad Action,Adventure,Fantasy \n", + "... ... ... \n", + "1063 Guardians of the Galaxy Vol. 
2 Action, Adventure, Comedy \n", + "1064 Baby Driver Action, Crime, Drama \n", + "1065 Only the Brave Action, Biography, Drama \n", + "1066 Incredibles 2 Animation, Action, Adventure \n", + "1067 A Star Is Born Drama, Music, Romance \n", + "\n", + " Director Cast \\\n", + "0 James Gunn Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S... \n", + "1 Ridley Scott Noomi Rapace, Logan Marshall-Green, Michael ... \n", + "2 M. Night Shyamalan James McAvoy, Anya Taylor-Joy, Haley Lu Richar... \n", + "3 Christophe Lourdelet Matthew McConaughey,Reese Witherspoon, Seth Ma... \n", + "4 David Ayer Will Smith, Jared Leto, Margot Robbie, Viola D... \n", + "... ... ... \n", + "1063 James Gunn Chris Pratt, Zoe Saldana, Dave Bautista, Vin D... \n", + "1064 Edgar Wright Ansel Elgort, Jon Bernthal, Jon Hamm, Eiza Gon... \n", + "1065 Joseph Kosinski Josh Brolin, Miles Teller, Jeff Bridges, Jenni... \n", + "1066 Brad Bird Craig T. Nelson, Holly Hunter, Sarah Vowell, H... \n", + "1067 Bradley Cooper Lady Gaga, Bradley Cooper, Sam Elliott, Greg G... \n", + "\n", + " Year Runtime Rating Revenue \n", + "0 2014 121 8.1 333.13 \n", + "1 2012 124 7.0 126.46M \n", + "2 2016 117 7.3 138.12M \n", + "3 2016 108 7.2 270.32 \n", + "4 2016 123 6.2 325.02 \n", + "... ... ... ... ... 
\n", + "1063 2017 136 7.6 389.81 \n", + "1064 2017 113 7.6 107.83 \n", + "1065 2017 134 7.6 18.34 \n", + "1066 2018 118 7.6 608.58 \n", + "1067 2018 136 7.6 215.29 \n", + "\n", + "[1068 rows x 8 columns]" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.iloc[:, 1:] #all the rows, not column 0\n", + "df\n", + " \n", + " # remember slicing is row, column" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write a df to a csv file" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv(\"better_movies.csv\", index = False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Practice on your own.....Data Analysis with Data Frames\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What are all the movies that have above average run time (long movies)? " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "long_movies = ???\n", + "long_movies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Which long movie has the lowest rating?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# of these movies, what was the min rating? \n", + "min_rating = ???\n", + "min_rating" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Which movies had this min rating?\n", + "???" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What are all long movies with someone in the cast named \"Emma\" ? " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "???" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What is the title of the shortest movie?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "???" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What movie had the highest revenue?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"Revenue\"].max() # does not work, Why?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We need to clean our data\n", + "# Some movies have M at the end and others don't.\n", + "# All revenues are in millions of dollars.\n", + "def format_revenue(revenue):\n", + " \"\"\" \n", + " Checks the last character of the string and formats accordingly\n", + " \"\"\"\n", + " if type(revenue) == float: # need this in here if we run code multiple times\n", + " return revenue\n", + " elif revenue[-1] == 'M': # some have an \"M\" at the end\n", + " return ??? # TODO: convert relevant part of the string to float and multiply by 1e6\n", + " else:\n", + " return ??? # TODO: convert to float and multiply by 1e6" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# What movie had the highest revenue?\n", + "revenue = df[\"Revenue\"].apply(format_revenue) # apply a function to a column; returns a Series\n", + "print(revenue.head())\n", + "max_revenue = revenue.max()\n", + "\n", + "# make a copy of our df\n", + "rev_df = df.copy()\n", + "rev_df[\"Revenue (float)\"] = revenue\n", + "rev_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Now we can answer the question!\n", + "???" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Or more generally...\n", + "rev_df.sort_values(by = \"Revenue (float)\", ascending = False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What is the average runtime for movies by \"Francis Lawrence\"?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### More complicated questions..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Which director had the highest average rating? \n", + "\n", + "# one way is to make a python dict of director, list of ratings\n", + "director_dict = dict()\n", + "\n", + "# make the dictionary: key is director, value is list of ratings\n", + "for i in range(len(df)):\n", + " director = df.loc[i, \"Director\"]\n", + " rating = df.loc[i, \"Rating\"]\n", + " #print(i, director, rating)\n", + " if director not in director_dict:\n", + " director_dict[director] = []\n", + " director_dict[director].append(rating)\n", + "\n", + "# make a ratings dict: key is director, value is average\n", + "# only include directors with > 4 movies\n", + "ratings_dict = {k:sum(v)/len(v) for (k,v) in director_dict.items() if len(v) > 4}\n", + "\n", + "#sort a dict by values\n", + "dict(sorted(ratings_dict.items(), key=lambda t:t[-1], reverse=True))\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# FOR DEMONSTRATION PURPOSES ONLY\n", + "# We haven't (and will not) learn about \"groupby\"\n", + "# Pandas has many operations which will be helpful!\n", + "\n", + "# Consider what you already know, and what Pandas can 
solve\n", + "# when formulating your solutions.\n", + "rating_groups = df.groupby(\"Director\")[\"Rating\"]\n", + "rating_groups.mean()[rating_groups.count() > 4].sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extra Practice: Make up some of your own questions about the movies" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/sum23/lecture_materials/17_Pandas/lec_17_pandas2_dataframe_template.ipynb b/sum23/lecture_materials/17_Pandas/lec_17_pandas2_dataframe_template.ipynb index b775fd1c48bd7b4c55d93339a8a3944dae31e62d..cb438bdfc72dee05e390f0ebd38da79f61e33e00 100644 --- a/sum23/lecture_materials/17_Pandas/lec_17_pandas2_dataframe_template.ipynb +++ b/sum23/lecture_materials/17_Pandas/lec_17_pandas2_dataframe_template.ipynb @@ -2,13 +2,14 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from pandas import Series, DataFrame\n", - "# We can explictly import Series and DataFrame, why might we do this?" 
+ "# We can explicitly import Series and DataFrame, why might we do this?\n", + "# we're lazy: can write Series instead of pd.Series" ] }, { @@ -27,16 +28,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 54\n", + "1 22\n", + "2 19\n", + "3 73\n", + "4 80\n", + "dtype: int64" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "scores_list = [54, 22, 19, 73, 80]\n", "scores_series = Series(scores_list)\n", "scores_series\n", "\n", - "# What is the terminology for: 0, 1, 2, ... ?? A: \n", - "# What is the terminology for: 54, 22, 19, .... ?? A: " + "# What is the terminology for: 0, 1, 2, ... ?? A: index\n", + "# What is the terminology for: 54, 22, 19, .... ?? A: values" ] }, { @@ -49,10 +66,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "0 54\n", + "3 73\n", + "4 80\n", + "dtype: int64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scores_series[scores_series > 50]" + ] }, { "cell_type": "markdown", @@ -63,9 +96,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 54\n", + "1 22\n", + "4 80\n", + "dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "scores_series[[True, True, False, False, True]] # often called a \"mask\"" ] @@ -79,10 +126,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "0 54\n", + "3 73\n", + "4 80\n", + "dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, 
+ "output_type": "execute_result" + } + ], + "source": [ + "mask = scores_series > 50\n", + "scores_series[mask]" + ] }, { "cell_type": "markdown", @@ -93,9 +157,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rita 5\n", + "Therese 3\n", + "Janice 6\n", + "dtype: int64\n", + "Rita 3\n", + "Therese 7\n", + "Janice 4\n", + "dtype: int64\n", + "Therese 5\n", + "Janice 5\n", + "Rita 8\n", + "dtype: int64\n" + ] + } + ], "source": [ "# Imagine we hire students and track their weekly hours\n", "week1 = Series({\"Rita\":5, \"Therese\":3, \"Janice\": 6})\n", @@ -115,11 +198,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Rita 8\n", + "Therese 6\n", + "Janice 9\n", + "dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "\n", + "week1 = week1 + 3\n", "week1" ] }, @@ -132,11 +229,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "55" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "total_hours = ???\n", + "total_hours = week1.sum() + week2.sum() + week3.sum()\n", "total_hours" ] }, @@ -149,11 +257,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Janice 1.8\n", + "Rita 1.0\n", + "Therese 1.2\n", + "dtype: float64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "???\n", + "week1 / week3\n", "# Notice that we didn't have to worry about the order of indices" ] }, @@ -166,13 +288,32 @@ }, { 
"cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rita 8\n", + "Therese 6\n", + "Janice 9\n", + "dtype: int64\n", + "Rita 3\n", + "Therese 7\n", + "Janice 4\n", + "dtype: int64\n", + "Rita True\n", + "Therese False\n", + "Janice True\n", + "dtype: bool\n" + ] + } + ], "source": [ "print(week1)\n", "print(week2)\n", - "???\n", + "print(week1 > week2)\n", "# Notice that indices are ordered the same" ] }, @@ -185,15 +326,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rita 8\n", + "Therese 6\n", + "Janice 9\n", + "dtype: int64\n", + "Therese 5\n", + "Janice 5\n", + "Rita 8\n", + "dtype: int64\n", + "Janice True\n", + "Rita False\n", + "Therese True\n", + "dtype: bool\n" + ] + } + ], "source": [ "print(week1)\n", "print(week3)\n", - "??? # Does it work?\n", + "# print(week1 > week3) # Does it work?\n", "\n", - "# How can we fix this?" 
+ "# How can we fix this?\n", + "print(week1.sort_index() > week3.sort_index())" ] }, { @@ -237,7 +398,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -256,15 +417,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "name_list = [\"Alice\", \"Bob\", \"Cindy\", \"Dan\"]\n", "score_list = [6, 7, 8, 9]\n", "\n", - "# this is the same as above, reminding us that Series act like lists\n", - "\n" + "# this is the same as above, reminding us that Series act like lists\n" ] }, { @@ -277,15 +437,77 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>name</th>\n", + " <th>scores</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Alice</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Bob</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Cindy</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Dan</td>\n", + " <td>9</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " name scores\n", + "0 Alice 6\n", + "1 Bob 7\n", + "2 Cindy 8\n", + "3 Dan 9" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data = {\n", - " \"Player name\": {0: \"Alice\", 1: 
\"Bob\", 2: \"Cindy\", 3: \"Dan\"},\n", - " \"Score\": {0: 6, 1: 7, 2: 8, 3: 9}\n", - "}\n", - "data" + " \"name\": {0: \"Alice\", 1: \"Bob\", 2: \"Cindy\", 3: \"Dan\"},\n", + " \"scores\": {0: 6, 1: 7, 2: 8, 3: 9}\n", + "}\n" ] }, { @@ -297,17 +519,79 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Alice</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Bob</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Cindy</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Dan</td>\n", + " <td>9</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "0 Alice 6\n", + "1 Bob 7\n", + "2 Cindy 8\n", + "3 Dan 9" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data = [\n", " {\"Player name\": \"Alice\", \"Score\": 6},\n", " {\"Player name\": \"Bob\", \"Score\": 7},\n", " {\"Player name\": \"Cindy\", \"Score\": 8},\n", " {\"Player name\": \"Dan\", \"Score\": 9}\n", - "]\n", - "data" + "]\n" ] }, { @@ -319,9 +603,72 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>0</th>\n", + " <th>1</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Alice</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Bob</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Cindy</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Dan</td>\n", + " <td>9</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " 0 1\n", + "0 Alice 6\n", + "1 Bob 7\n", + "2 Cindy 8\n", + "3 Dan 9" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data = [\n", " [\"Alice\", 6],\n", @@ -342,9 +689,72 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Alice</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Bob</td>\n", 
+ " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Cindy</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Dan</td>\n", + " <td>9</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Name Score\n", + "0 Alice 6\n", + "1 Bob 7\n", + "2 Cindy 8\n", + "3 Dan 9" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data = [\n", " [\"Alice\", 6],\n", @@ -352,7 +762,7 @@ " [\"Cindy\", 8],\n", " [\"Dan\", 9]\n", "]\n", - "data" + "data\n" ] }, { @@ -365,9 +775,72 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>Alice</th>\n", + " <td>Alice</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Bob</th>\n", + " <td>Bob</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Cindy</th>\n", + " <td>Cindy</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Dan</th>\n", + " <td>Dan</td>\n", + " <td>9</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "Alice Alice 6\n", + "Bob Bob 7\n", + "Cindy Cindy 8\n", + "Dan Dan 9" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data = [\n", " {\"Player name\": \"Alice\", \"Score\": 
6},\n", @@ -375,7 +848,7 @@ " {\"Player name\": \"Cindy\", \"Score\": 8},\n", " {\"Player name\": \"Dan\", \"Score\": 9}\n", "]\n", - "data" + "data\n" ] }, { @@ -404,9 +877,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Alice 6\n", + "Bob 7\n", + "Cindy 8\n", + "Dan 9\n", + "dtype: int64" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "hours = Series({\"Alice\": 6, \"Bob\": 7, \"Cindy\": 8, \"Dan\": 9})\n", "hours" @@ -414,27 +902,60 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "7" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Lookup Bob's hours by pandas index.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "7" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Lookup Bob's hours by integer position.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "8" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Lookup Cindy's hours by pandas index.\n" ] @@ -455,9 +976,72 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " 
.dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>H</th>\n", + " <td>Hope</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>P</th>\n", + " <td>Peace</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>J</th>\n", + " <td>Joy</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>L</th>\n", + " <td>Love</td>\n", + " <td>11</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "H Hope 10\n", + "P Peace 7\n", + "J Joy 4\n", + "L Love 11" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# We often call the object that we make df\n", "data = [\n", @@ -479,9 +1063,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Player name Love\n", + "Score 11\n", + "Name: L, dtype: object" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Player name Love\n", + "Score 11\n", + "Name: L, dtype: object" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Player name Love\n", + "Score 11\n", + "Name: L, dtype: object" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [] }, { @@ -493,18 +1130,46 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 39, "metadata": {}, - "outputs": [], - "source": [ - "df" - ] + "outputs": [ + { + "data": { + "text/plain": [ + "H 10\n", + "P 7\n", + "J 4\n", + "L 11\n", + "Name: Score, dtype: int64" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "H 10\n", + "P 7\n", + "J 4\n", + "L 11\n", + "Name: Score, dtype: int64" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [] }, { @@ -516,18 +1181,56 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": {}, - "outputs": [], - "source": [ - "df" - ] + "outputs": [ + { + "data": { + "text/plain": [ + "11" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "10" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [] }, { @@ -542,29 +1245,218 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#change player D's name\n", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", 
+ " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>H</th>\n", + " <td>Hope</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>P</th>\n", + " <td>Peace</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>J</th>\n", + " <td>Joy</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>L</th>\n", + " <td>Luisa</td>\n", + " <td>11</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "H Hope 10\n", + "P Peace 7\n", + "J Joy 4\n", + "L Luisa 11" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#change player L's name\n", "df.loc[\"L\", \"Player name\"] = \"Luisa\"\n", "df" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>H</th>\n", + " <td>Hope</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>P</th>\n", + " <td>Peace</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>J</th>\n", + " <td>Joy</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + 
" <th>L</th>\n", + " <td>Luisa</td>\n", + " <td>14</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "H Hope 10\n", + "P Peace 7\n", + "J Joy 4\n", + "L Luisa 14" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# then add 3 to that player's score using .loc\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>H</th>\n", + " <td>Hope</td>\n", + " <td>17</td>\n", + " </tr>\n", + " <tr>\n", + " <th>P</th>\n", + " <td>Peace</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>J</th>\n", + " <td>Joy</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>L</th>\n", + " <td>Luisa</td>\n", + " <td>14</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "H Hope 17\n", + "P Peace 7\n", + "J Joy 4\n", + "L Luisa 14" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# add 7 to a different player's score using .iloc\n" ] @@ -578,9 +1470,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 53, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "17 10.5\n" + ] + } + ], 
"source": [ "# find the max and mean of the \"Score\" column\n", "print(df[\"Score\"].max(), df[\"Score\"].mean())" @@ -595,9 +1495,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 59, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'Hope'" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [] }, { @@ -612,18 +1523,120 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>P</th>\n", + " <td>Peace</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>J</th>\n", + " <td>Joy</td>\n", + " <td>4</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "P Peace 7\n", + "J Joy 4" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.iloc[1:3, 0:2]" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " 
text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Player name</th>\n", + " <th>Score</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>P</th>\n", + " <td>Peace</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>J</th>\n", + " <td>Joy</td>\n", + " <td>4</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Player name Score\n", + "P Peace 7\n", + "J Joy 4" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df.loc[\"P\":\"J\", \"Player name\":\"Score\"] # notice that this way is inclusive of endpoints" ] @@ -1200,7 +2213,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.10.6" } }, "nbformat": 4,