diff --git a/s24/Louis_Lecture_Notes/36_Database3/Lec36_WorksheetAnswers.ipynb b/s24/Louis_Lecture_Notes/36_Database3/Lec36_WorksheetAnswers.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..13e8e827bd6955ce327360ff0bcb657e5e8a4b83 --- /dev/null +++ b/s24/Louis_Lecture_Notes/36_Database3/Lec36_WorksheetAnswers.ipynb @@ -0,0 +1,3316 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<style>em { color: red; }</style> <style>.container {width:100% !important; }</style>" + ], + "text/plain": [ + "<IPython.core.display.HTML object>" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# ignore this cell - it makes the emphasized text red and uses the full width of the screen\n", + "from IPython.core.display import HTML\n", + "HTML('<style>em { color: red; }</style> <style>.container {width:100% !important; }</style>')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import sqlite3\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "conn = sqlite3.connect(\"worksheet.db\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# this function gives us a shortcut to making queries\n", + "# instead of typing all that code over and over again, we just call qry with our SQL\n", + "# it assumes we have access to a connection object, conn\n", + "\n", + "def qry(QUERY):\n", + " '''QUERY is a string containing SQL, conn is a global connection variable'''\n", + " return pd.read_sql(QUERY, conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " 
vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>type</th>\n", + " <th>name</th>\n", + " <th>tbl_name</th>\n", + " <th>rootpage</th>\n", + " <th>sql</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>table</td>\n", + " <td>hydrants</td>\n", + " <td>hydrants</td>\n", + " <td>2</td>\n", + " <td>CREATE TABLE \"hydrants\" (\\n\"year\" INTEGER,\\n ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>table</td>\n", + " <td>trees</td>\n", + " <td>trees</td>\n", + " <td>3</td>\n", + " <td>CREATE TABLE \"trees\" (\\n\"tree\" TEXT,\\n \"x\" IN...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>table</td>\n", + " <td>species</td>\n", + " <td>species</td>\n", + " <td>4</td>\n", + " <td>CREATE TABLE \"species\" (\\n\"code\" TEXT,\\n \"spe...</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " type name tbl_name rootpage \\\n", + "0 table hydrants hydrants 2 \n", + "1 table trees trees 3 \n", + "2 table species species 4 \n", + "\n", + " sql \n", + "0 CREATE TABLE \"hydrants\" (\\n\"year\" INTEGER,\\n ... \n", + "1 CREATE TABLE \"trees\" (\\n\"tree\" TEXT,\\n \"x\" IN... \n", + "2 CREATE TABLE \"species\" (\\n\"code\" TEXT,\\n \"spe... 
" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.read_sql(\"SELECT * from sqlite_master\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Remember that one database can hold several tables\n", + "hydrants = qry(\"SELECT * FROM hydrants\")\n", + "trees = qry(\"SELECT * FROM trees\")\n", + "species = qry(\"SELECT * FROM species\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>tree</th>\n", + " <th>x</th>\n", + " <th>y</th>\n", + " <th>species</th>\n", + " <th>diameter</th>\n", + " <th>priority</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>A</td>\n", + " <td>10</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>8</td>\n", + " <td>71</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>B</td>\n", + " <td>20</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>10</td>\n", + " <td>100</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>C</td>\n", + " <td>30</td>\n", + " <td>4</td>\n", + " <td>p</td>\n", + " <td>6</td>\n", + " <td>30</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>D</td>\n", + " <td>40</td>\n", + " <td>4</td>\n", + " <td>p</td>\n", + " <td>8</td>\n", + " <td>40</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>E</td>\n", + " <td>50</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>12</td>\n", + " <td>99</td>\n", 
+ " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " tree x y species diameter priority\n", + "0 A 10 4 m 8 71\n", + "1 B 20 4 m 10 100\n", + "2 C 30 4 p 6 30\n", + "3 D 40 4 p 8 40\n", + "4 E 50 4 m 12 99" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# this is made-up data, but is inspired by an actual City of Madison database!\n", + "trees" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://data-cityofmadison.opendata.arcgis.com/datasets/b700541a20e446839b18d62426c266a3/explore?location=43.072110%2C-89.405159%2C18.00" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>code</th>\n", + " <th>species</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>m</td>\n", + " <td>maple</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>p</td>\n", + " <td>pine</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " code species\n", + "0 m maple\n", + "1 p pine" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Databases typically split up data into manageable pieces\n", + "# It may be more efficient to keep the species codes separate, since they are rarely updated\n", + "species" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ 
+ "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>year</th>\n", + " <th>color</th>\n", + " <th>style</th>\n", + " <th>owner</th>\n", + " <th>alt</th>\n", + " <th>active</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1999</td>\n", + " <td>red</td>\n", + " <td>K-81</td>\n", + " <td>private</td>\n", + " <td>1179</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>2000</td>\n", + " <td>red</td>\n", + " <td>M-3</td>\n", + " <td>public</td>\n", + " <td>1065</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2001</td>\n", + " <td>green</td>\n", + " <td>Pacer</td>\n", + " <td>private</td>\n", + " <td>1058</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>2010</td>\n", + " <td>blue</td>\n", + " <td>Pacer</td>\n", + " <td>public</td>\n", + " <td>1081</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>2014</td>\n", + " <td>blue</td>\n", + " <td>Pacer</td>\n", + " <td>public</td>\n", + " <td>1052</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>2018</td>\n", + " <td>blue</td>\n", + " <td>Pacer</td>\n", + " <td>public</td>\n", + " <td>1109</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " year color style owner alt active\n", + "0 1999 red K-81 private 1179 0\n", + "1 2000 red M-3 public 1065 0\n", + "2 2001 green Pacer private 1058 1\n", + "3 2010 blue Pacer public 1081 1\n", + "4 2014 blue Pacer public 1052 1\n", + "5 2018 blue Pacer public 
1109 1" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The City of Madison keeps data on fire hydrants!\n", + "hydrants" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://data-cityofmadison.opendata.arcgis.com/datasets/54c4877f16084409849ebd5385e2ee27_6/explore?location=43.071084%2C-89.403280%2C17.00" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1a. *Without* running this cell - *predict* the output of the following statement" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>tree</th>\n", + " <th>x</th>\n", + " <th>y</th>\n", + " <th>species</th>\n", + " <th>diameter</th>\n", + " <th>priority</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>B</td>\n", + " <td>20</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>10</td>\n", + " <td>100</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>E</td>\n", + " <td>50</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>12</td>\n", + " <td>99</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " tree x y species diameter priority\n", + "1 B 20 4 m 10 100\n", + "4 E 50 4 m 12 99" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trees[trees[\"priority\"] > 90] " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": 
{}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>x</th>\n", + " <th>y</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>20</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>50</td>\n", + " <td>4</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " x y\n", + "1 20 4\n", + "4 50 4" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trees[trees[\"priority\"] > 90] [[\"x\", \"y\"]] # show only the columns in this list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1b. *Convert* the statement to an equivalent *SQL* query." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>tree</th>\n", + " <th>x</th>\n", + " <th>y</th>\n", + " <th>species</th>\n", + " <th>diameter</th>\n", + " <th>priority</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>A</td>\n", + " <td>10</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>8</td>\n", + " <td>71</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>B</td>\n", + " <td>20</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>10</td>\n", + " <td>100</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>C</td>\n", + " <td>30</td>\n", + " <td>4</td>\n", + " <td>p</td>\n", + " <td>6</td>\n", + " <td>30</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>D</td>\n", + " <td>40</td>\n", + " <td>4</td>\n", + " <td>p</td>\n", + " <td>8</td>\n", + " <td>40</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>E</td>\n", + " <td>50</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>12</td>\n", + " <td>99</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " tree x y species diameter priority\n", + "0 A 10 4 m 8 71\n", + "1 B 20 4 m 10 100\n", + "2 C 30 4 p 6 30\n", + "3 D 40 4 p 8 40\n", + "4 E 50 4 m 12 99" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trees" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>x</th>\n", + " <th>y</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>20</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>50</td>\n", + " <td>4</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " x y\n", + "0 20 4\n", + "1 50 4" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qry(\"select x,y from trees where priority > 90\") " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "### 2a. 
*Predict* the output of the following *SQL* query" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>tree</th>\n", + " <th>x</th>\n", + " <th>y</th>\n", + " <th>species</th>\n", + " <th>diameter</th>\n", + " <th>priority</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>A</td>\n", + " <td>10</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>8</td>\n", + " <td>71</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>B</td>\n", + " <td>20</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>10</td>\n", + " <td>100</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>C</td>\n", + " <td>30</td>\n", + " <td>4</td>\n", + " <td>p</td>\n", + " <td>6</td>\n", + " <td>30</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>D</td>\n", + " <td>40</td>\n", + " <td>4</td>\n", + " <td>p</td>\n", + " <td>8</td>\n", + " <td>40</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>E</td>\n", + " <td>50</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>12</td>\n", + " <td>99</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " tree x y species diameter priority\n", + "0 A 10 4 m 8 71\n", + "1 B 20 4 m 10 100\n", + "2 C 30 4 p 6 30\n", + "3 D 40 4 p 8 40\n", + "4 E 50 4 m 12 99" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trees" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>x+y</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>14</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>54</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " x+y\n", + "0 14\n", + "1 24\n", + "2 54" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qry(\"SELECT x+y FROM trees WHERE species = 'm'\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2b. *Convert* the query into an equivalent *pandas* statement." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 10\n", + "1 20\n", + "2 30\n", + "3 40\n", + "4 50\n", + "Name: x, dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Series\n", + "trees[\"x\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 10\n", + "1 20\n", + "4 50\n", + "Name: x, dtype: int64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Series with Boolean indexing applied\n", + "trees[\"x\"] [trees[\"species\"] == 'm']" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 4\n", + "1 4\n", + "4 4\n", + "Name: y, dtype: int64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trees[\"y\"][trees[\"species\"] == 'm']" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 14\n", + "1 24\n", + "4 54\n", + "dtype: int64" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# because the two Series have matching indices, we can add them\n", + "# this answer is acceptable on a quiz/exam\n", + "trees[\"x\"][trees[\"species\"] == 'm'] + trees[\"y\"][trees[\"species\"] == 'm']" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + 
"</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>x+y</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>14</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>54</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " x+y\n", + "0 14\n", + "1 24\n", + "4 54" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# if you want to get fancy, you can turn a Series into a DataFrame and add column names\n", + "result2 = pd.DataFrame(trees[\"x\"][trees[\"species\"] == 'm'] + trees[\"y\"][trees[\"species\"] == 'm'])\n", + "result2.columns = [\"x+y\"]\n", + "result2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "### 3a. *Predict* the output of the following *pandas* statements" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>code</th>\n", + " <th>species</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>m</td>\n", + " <td>maple</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>p</td>\n", + " <td>pine</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " code species\n", + "0 m maple\n", + "1 p pine" + ] + }, + "execution_count": 21, 
+ "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "species" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# species[\"code\"] [species[\"species\"]==\"maple\"] .iloc[0]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 m\n", + "1 p\n", + "Name: code, dtype: object" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# this is a Series\n", + "species[\"code\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 m\n", + "Name: code, dtype: object" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Series with Boolean indexing applied\n", + "species[\"code\"] [species[\"species\"]==\"maple\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'m'" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Series with Boolean indexing applied, then get the value at integer location 0\n", + "species[\"code\"] [species[\"species\"]==\"maple\"] .iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'m'" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cd = species[\"code\"][species[\"species\"]==\"maple\"].iloc[0]\n", + "cd" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " 
vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>tree</th>\n", + " <th>x</th>\n", + " <th>y</th>\n", + " <th>species</th>\n", + " <th>diameter</th>\n", + " <th>priority</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>A</td>\n", + " <td>10</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>8</td>\n", + " <td>71</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>B</td>\n", + " <td>20</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>10</td>\n", + " <td>100</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>E</td>\n", + " <td>50</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>12</td>\n", + " <td>99</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " tree x y species diameter priority\n", + "0 A 10 4 m 8 71\n", + "1 B 20 4 m 10 100\n", + "4 E 50 4 m 12 99" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# DataFrame with Boolean Indexing\n", + "trees [trees[\"species\"] == cd]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 A\n", + "1 B\n", + "4 E\n", + "Name: tree, dtype: object" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# DataFrame with Boolean Indexing with column selection\n", + "trees [trees[\"species\"] == cd] ['tree']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3b. *Convert* the statements into an equivalent *SQL* query." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>code</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>m</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " code\n", + "0 m" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qry(\"select code from species where species = 'maple' \") " + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 m\n", + "Name: code, dtype: object" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# DataFrame with column selection\n", + "qry(\"select code from species where species = 'maple' \") ['code']" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'m'" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# DataFrame with column selection, then get the value at iloc 0\n", + "cd = qry(\"select code from species where species = 'maple' \") ['code'] .iloc[0]\n", + "cd\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", +
"\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>tree</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>A</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>B</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>E</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " tree\n", + "0 A\n", + "1 B\n", + "2 E" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# hard coding \n", + "qry(\"select tree from trees where species = 'm'\" ) " + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>tree</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>A</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>B</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>E</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " tree\n", + "0 A\n", + "1 B\n", + "2 E" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# not hard coding\n", + "qry(\"select tree from trees where species = '{}'\".format(cd))" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "### 4a. *Predict* the output of the following query" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>species</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>m</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>m</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>m</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>p</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>p</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " species\n", + "0 m\n", + "1 m\n", + "2 m\n", + "3 p\n", + "4 p" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qry(\"SELECT species FROM trees ORDER BY priority DESC\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4b. *Convert* the query into an equivalent *pandas* statement." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 m\n", + "4 m\n", + "0 m\n", + "3 p\n", + "2 p\n", + "Name: species, dtype: object" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# DataFrame sorted by priority (descending), with column selection\n", + "trees.sort_values(\"priority\", ascending = False) [\"species\"]" + ] + }, + { + "cell_type": "markdown", +
"metadata": {}, + "source": [ + "----\n", + "### 5a. *Predict* the output of the following code" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>tree</th>\n", + " <th>x</th>\n", + " <th>y</th>\n", + " <th>species</th>\n", + " <th>diameter</th>\n", + " <th>priority</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>A</td>\n", + " <td>10</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>8</td>\n", + " <td>71</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>B</td>\n", + " <td>20</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>10</td>\n", + " <td>100</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>C</td>\n", + " <td>30</td>\n", + " <td>4</td>\n", + " <td>p</td>\n", + " <td>6</td>\n", + " <td>30</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>D</td>\n", + " <td>40</td>\n", + " <td>4</td>\n", + " <td>p</td>\n", + " <td>8</td>\n", + " <td>40</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>E</td>\n", + " <td>50</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>12</td>\n", + " <td>99</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " tree x y species diameter priority\n", + "0 A 10 4 m 8 71\n", + "1 B 20 4 m 10 100\n", + "2 C 30 4 p 6 30\n", + "3 D 40 4 p 8 40\n", + "4 E 50 4 m 12 99" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trees" + ] + }, + { + 
"cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "# list(qry(\"SELECT tree, priority FROM trees ORDER BY priority DESC LIMIT 1\").iloc[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>tree</th>\n", + " <th>priority</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>B</td>\n", + " <td>100</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " tree priority\n", + "0 B 100" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qry(\"SELECT tree, priority FROM trees ORDER BY priority DESC LIMIT 1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tree B\n", + "priority 100\n", + "Name: 0, dtype: object" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qry(\"SELECT tree, priority FROM trees ORDER BY priority DESC LIMIT 1\").iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['B', 100]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# list gets the values only\n", + "list(qry(\"SELECT tree, priority FROM trees ORDER BY priority DESC LIMIT 1\").iloc[0])" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "### 5.b *Convert* the above code to *Pandas*" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>tree</th>\n", + " <th>x</th>\n", + " <th>y</th>\n", + " <th>species</th>\n", + " <th>diameter</th>\n", + " <th>priority</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>B</td>\n", + " <td>20</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>10</td>\n", + " <td>100</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>E</td>\n", + " <td>50</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>12</td>\n", + " <td>99</td>\n", + " </tr>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>A</td>\n", + " <td>10</td>\n", + " <td>4</td>\n", + " <td>m</td>\n", + " <td>8</td>\n", + " <td>71</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>D</td>\n", + " <td>40</td>\n", + " <td>4</td>\n", + " <td>p</td>\n", + " <td>8</td>\n", + " <td>40</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>C</td>\n", + " <td>30</td>\n", + " <td>4</td>\n", + " <td>p</td>\n", + " <td>6</td>\n", + " <td>30</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " tree x y species diameter priority\n", + "1 B 20 4 m 10 100\n", + "4 E 50 4 m 12 99\n", + "0 A 10 4 m 8 71\n", + "3 D 40 4 p 8 40\n", + "2 C 30 4 p 6 30" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"trees.sort_values(\"priority\", ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tree B\n", + "x 20\n", + "y 4\n", + "species m\n", + "diameter 10\n", + "priority 100\n", + "Name: 1, dtype: object" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# DataFrame # grab the first row\n", + "trees.sort_values(\"priority\", ascending=False) .iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tree B\n", + "priority 100\n", + "Name: 1, dtype: object" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# DataFrame # grab the first row #slicing by certain indices\n", + "trees.sort_values(\"priority\", ascending=False) .iloc[0] [['tree', 'priority']]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['B', 100]" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# list gets the values only\n", + "list(trees.sort_values(\"priority\", ascending=False).iloc[0] [['tree', 'priority']] )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "### 6a. 
*Predict* the output of the following code" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "# qry(\"\"\"SELECT COUNT(SPECIES) AS c1,\n", + "# COUNT(DISTINCT SPECIES) as c2\n", + "# FROM trees\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>c1</th>\n", + " <th>c2</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>5</td>\n", + " <td>2</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " c1 c2\n", + "0 5 2" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qry(\"\"\"SELECT COUNT(SPECIES) AS c1,\n", + " COUNT(DISTINCT SPECIES) as c2\n", + " FROM trees\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6b. 
*Convert* the above code to *Pandas*" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1 = len(trees)\n", + "c1" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "m 3\n", + "p 2\n", + "Name: species, dtype: int64" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trees[\"species\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2 = len(trees[\"species\"].value_counts())\n", + "c2" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[5, 2]" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# this answer is acceptable\n", + "[c1, c2]" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>c1</th>\n", + " <th>c2</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>5</td>\n", + " <td>2</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + 
], + "text/plain": [ + " c1 c2\n", + "0 5 2" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# A dataframe can be made from a dict of lists\n", + "d = {\"c1\":[c1], \"c2\":[c2]}\n", + "pd.DataFrame(d)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "### 7a. *Predict* the output of the following code" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "# qry(\"\"\"SELECT species, COUNT(SPECIES) AS count,\n", + "# AVG(diameter) AS size\n", + "# FROM trees\n", + "# GROUP BY species ORDER BY count DESC\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>species</th>\n", + " <th>count</th>\n", + " <th>size</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>m</td>\n", + " <td>3</td>\n", + " <td>10.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>p</td>\n", + " <td>2</td>\n", + " <td>7.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " species count size\n", + "0 m 3 10.0\n", + "1 p 2 7.0" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qry(\"\"\"SELECT species, COUNT(SPECIES) AS count,\n", + " AVG(diameter) AS size\n", + " FROM trees\n", + " GROUP BY species ORDER BY count DESC\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "### 7b. *Convert* the above code to *Pandas*" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['m', 'p']" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# part 1: species\n", + "species_list = list(pd.unique(trees['species']))\n", + "species_list" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>x</th>\n", + " <th>y</th>\n", + " <th>diameter</th>\n", + " <th>priority</th>\n", + " </tr>\n", + " <tr>\n", + " <th>species</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>m</th>\n", + " <td>26.666667</td>\n", + " <td>4.0</td>\n", + " <td>10.0</td>\n", + " <td>90.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>p</th>\n", + " <td>35.000000</td>\n", + " <td>4.0</td>\n", + " <td>7.0</td>\n", + " <td>35.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " x y diameter priority\n", + "species \n", + "m 26.666667 4.0 10.0 90.0\n", + "p 35.000000 4.0 7.0 35.0" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trees.groupby(\"species\").mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[10.0, 7.0]" + ] + }, + "execution_count": 56, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# part 2: size\n", + "size_list = list(trees.groupby(\"species\").mean()[\"diameter\"]) \n", + "size_list" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[3, 2]" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# part 3: counts\n", + "count_list = list(trees['species'].value_counts())\n", + "count_list" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>species</th>\n", + " <th>count</th>\n", + " <th>size</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>m</td>\n", + " <td>3</td>\n", + " <td>10.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>p</td>\n", + " <td>2</td>\n", + " <td>7.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " species count size\n", + "0 m 3 10.0\n", + "1 p 2 7.0" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# part 4: make a DataFrame from a dict of lists\n", + "pd.DataFrame({\"species\": species_list,\n", + " \"count\": count_list,\n", + " \"size\": size_list})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "# Additional Exercises: \n", + "### *Predict* the output of the following statements" + ] + }, + { + "cell_type": 
"code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>year</th>\n", + " <th>color</th>\n", + " <th>style</th>\n", + " <th>owner</th>\n", + " <th>alt</th>\n", + " <th>active</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1999</td>\n", + " <td>red</td>\n", + " <td>K-81</td>\n", + " <td>private</td>\n", + " <td>1179</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>2000</td>\n", + " <td>red</td>\n", + " <td>M-3</td>\n", + " <td>public</td>\n", + " <td>1065</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2001</td>\n", + " <td>green</td>\n", + " <td>Pacer</td>\n", + " <td>private</td>\n", + " <td>1058</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>2010</td>\n", + " <td>blue</td>\n", + " <td>Pacer</td>\n", + " <td>public</td>\n", + " <td>1081</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>2014</td>\n", + " <td>blue</td>\n", + " <td>Pacer</td>\n", + " <td>public</td>\n", + " <td>1052</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>2018</td>\n", + " <td>blue</td>\n", + " <td>Pacer</td>\n", + " <td>public</td>\n", + " <td>1109</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " year color style owner alt active\n", + "0 1999 red K-81 private 1179 0\n", + "1 2000 red M-3 public 1065 0\n", + "2 2001 green Pacer private 1058 1\n", + "3 2010 
blue Pacer public 1081 1\n", + "4 2014 blue Pacer public 1052 1\n", + "5 2018 blue Pacer public 1109 1" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "hydrants" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>color</th>\n", + " <th>year</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>blue</td>\n", + " <td>2010</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>blue</td>\n", + " <td>2014</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>blue</td>\n", + " <td>2018</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " color year\n", + "0 blue 2010\n", + "1 blue 2014\n", + "2 blue 2018" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qry(\"SELECT color, year FROM hydrants WHERE color = 'blue' \")" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " 
<th></th>\n", + " <th>color</th>\n", + " <th>year</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>blue</td>\n", + " <td>2010</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>blue</td>\n", + " <td>2014</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>blue</td>\n", + " <td>2018</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " color year\n", + "3 blue 2010\n", + "4 blue 2014\n", + "5 blue 2018" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = qry(\"SELECT color, year FROM hydrants\")\n", + "df[df.color == \"blue\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>year</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>2001</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " year\n", + "0 2001" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qry(\"SELECT year FROM hydrants WHERE owner='private' AND active\")" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2 Pacer\n", + "3 Pacer\n", + "4 Pacer\n", + "5 Pacer\n", + "Name: style, dtype: object" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = 
qry(\"SELECT year, style, active FROM hydrants\")\n", + "df[df.active == 1][\"style\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>year</th>\n", + " <th>color</th>\n", + " <th>style</th>\n", + " <th>owner</th>\n", + " <th>alt</th>\n", + " <th>active</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1999</td>\n", + " <td>red</td>\n", + " <td>K-81</td>\n", + " <td>private</td>\n", + " <td>1179</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>2000</td>\n", + " <td>red</td>\n", + " <td>M-3</td>\n", + " <td>public</td>\n", + " <td>1065</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2001</td>\n", + " <td>green</td>\n", + " <td>Pacer</td>\n", + " <td>private</td>\n", + " <td>1058</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>2010</td>\n", + " <td>blue</td>\n", + " <td>Pacer</td>\n", + " <td>public</td>\n", + " <td>1081</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>2014</td>\n", + " <td>blue</td>\n", + " <td>Pacer</td>\n", + " <td>public</td>\n", + " <td>1052</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>2018</td>\n", + " <td>blue</td>\n", + " <td>Pacer</td>\n", + " <td>public</td>\n", + " <td>1109</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " year color style owner alt active\n", + "0 
1999 red K-81 private 1179 0\n", + "1 2000 red M-3 public 1065 0\n", + "2 2001 green Pacer private 1058 1\n", + "3 2010 blue Pacer public 1081 1\n", + "4 2014 blue Pacer public 1052 1\n", + "5 2018 blue Pacer public 1109 1" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hydrants" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "blue 3\n", + "red 2\n", + "green 1\n", + "Name: color, dtype: int64" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hydrants[\"color\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>color</th>\n", + " <th>COUNT(*)</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>blue</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>green</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " color COUNT(*)\n", + "0 blue 3\n", + "1 green 1" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qry(\"\"\"SELECT color, COUNT(*) FROM hydrants WHERE active GROUP BY color\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " 
.dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>color</th>\n", + " <th>count</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>blue</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>red</td>\n", + " <td>2</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " color count\n", + "0 blue 3\n", + "1 red 2" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qry(\"\"\"SELECT color, COUNT(*) AS count FROM hydrants GROUP BY color HAVING count > 1\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>color</th>\n", + " <th>count</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>green</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>red</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " color count\n", + "0 green 1\n", + "1 red 1" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "qry(\"\"\"SELECT color, COUNT(*) AS count\n", + " FROM hydrants WHERE year >= 2000\n", + " GROUP BY color HAVING count < 2\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/s24/Louis_Lecture_Notes/36_Database3/Lec36_WorksheetGuided.ipynb b/s24/Louis_Lecture_Notes/36_Database3/Lec36_WorksheetGuided.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..cbb06654ec539737dbf02c339c0536a338fe7daa --- /dev/null +++ b/s24/Louis_Lecture_Notes/36_Database3/Lec36_WorksheetGuided.ipynb @@ -0,0 +1,839 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ignore this cell - it makes the emphasized text red and uses the full width of the screen\n", + "from IPython.core.display import HTML\n", + "HTML('<style>em { color: red; }</style> <style>.container {width:100% !important; }</style>')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sqlite3\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conn = sqlite3.connect(\"worksheet.db\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# this function gives us a shortcut to making queries\n", + "# instead of typing all that code over and over again, we just call qry with our SQL\n", + "# 
it assumes we have access to a connection object, conn\n", + "\n", + "def qry(QUERY):\n", + " '''QUERY is a string containing SQL, conn is a global connection variable'''\n", + " return pd.read_sql(QUERY, conn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.read_sql(\"SELECT * from sqlite_master\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Remember that one database can hold several tables\n", + "hydrants = qry(\"SELECT * FROM hydrants\")\n", + "trees = qry(\"SELECT * FROM trees\")\n", + "species = qry(\"SELECT * FROM species\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# this is made-up data, but is inspired by an actual City of Madison database!\n", + "trees" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://data-cityofmadison.opendata.arcgis.com/datasets/b700541a20e446839b18d62426c266a3/explore?location=43.072110%2C-89.405159%2C18.00" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Databases typically split up data into manageable pieces\n", + "# It may be more efficient to keep the species codes separate, since they are rarely updated\n", + "species" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The City of Madison keeps data on fire hydrants!\n", + "hydrants" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://data-cityofmadison.opendata.arcgis.com/datasets/54c4877f16084409849ebd5385e2ee27_6/explore?location=43.071084%2C-89.403280%2C17.00" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1a. 
*Without* running this cell - *predict* the output of the following statement" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#trees[trees[\"priority\"] > 90] " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# DataFrame with Boolean Indexing # show only the columns in this list\n", + "#trees[trees[\"priority\"] > 90] [[\"x\", \"y\"]] " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1b. *Convert* the statement to an equivalent *SQL* query." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trees" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# your answer here" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "### 2a. *Predict* the output of the following *SQL* query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trees" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#qry(\"SELECT x+y FROM trees WHERE species = 'm'\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2b. *Convert* the query into an equivalent *pandas* statement."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Series\n", + "trees[\"x\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Series with Boolean indexing applied\n", + "#trees[\"x\"] [trees[\"species\"] == 'm']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Do the same for y " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# because the two Series have matching indices, we can add them\n", + "# this answer is acceptable on a quiz/exam\n", + "#trees[\"x\"][trees[\"species\"] == 'm'] + trees[\"y\"][trees[\"species\"] == 'm']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# if you want to get fancy, you can turn a Series into a DataFrame and add column names\n", + "#result2 = pd.DataFrame(trees[\"x\"][trees[\"species\"] == 'm'] + trees[\"y\"][trees[\"species\"] == 'm'])\n", + "#result2.columns = [\"x+y\"]\n", + "#result2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "### 3a. 
*Predict* the output of the following *pandas* statements" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "species" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# species[\"code\"] [species[\"species\"]==\"maple\"] .iloc[0]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# this is a Series\n", + "#species[\"code\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Series with Boolean indexing applied\n", + "# species[\"code\"] [species[\"species\"]==\"maple\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Series with Boolean indexing applied get the value at integer location 0\n", + "# species[\"code\"] [species[\"species\"]==\"maple\"] .iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cd = species[\"code\"][species[\"species\"]==\"maple\"].iloc[0]\n", + "cd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# DataFrame with Boolean Indexing\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# DataFrame with Boolean Indexing with column selection\n", + "# trees [trees[\"species\"] == cd] ['tree']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3b. *Convert* the statements into an equivalent *SQL* query."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "qry(\"select code from species where species = 'maple' \") " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# DataFrame with column selection\n", + "#qry(\"select code from species where species = 'maple' \") ['code']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# DataFrame with column selection get the value at iloc 0\n", + "#cd = qry(\"select code from species where species = 'maple' \") ['code'] .iloc[0]\n", + "#cd\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# hard coding \n", + "#qry(\"select tree from trees where species = 'm'\" ) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# not hard coding\n", + "#qry(\"select tree from trees where species = '{}'\".format(cd))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "### 4a. *Predict* the output of the following query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#qry(\"SELECT species FROM trees ORDER BY priority DESC\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.b *Convert* the query code to *Pandas*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# DataFrame sorted by priority # with column selection\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "### 5a.
*Predict* the output of the following code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trees" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# list(qry(\"SELECT tree, priority FROM trees ORDER BY priority DESC LIMIT 1\").iloc[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# qry(\"SELECT tree, priority FROM trees ORDER BY priority DESC LIMIT 1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#qry(\"SELECT tree, priority FROM trees ORDER BY priority DESC LIMIT 1\").iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# list gets the values only\n", + "#list(qry(\"SELECT tree, priority FROM trees ORDER BY priority DESC LIMIT 1\").iloc[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.b *Convert* the above code to *Pandas*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trees.sort_values(\"priority\", ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# DataFrame # grab the first row\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# DataFrame # grab the first row #slicing by certain indices\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# list gets the values only\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "### 6a. 
*Predict* the output of the following code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# qry(\"\"\"SELECT COUNT(SPECIES) AS c1,\n", + "# COUNT(DISTINCT SPECIES) as c2\n", + "# FROM trees\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#qry(\"\"\"SELECT COUNT(SPECIES) AS c1,\n", + "# COUNT(DISTINCT SPECIES) as c2\n", + "# FROM trees\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6b. *Convert* the above code to *Pandas*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get the 5\n", + "c1 = None\n", + "c1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get the 2\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "c2 = None\n", + "#c2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# this answer is acceptable\n", + "[c1, c2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# A dataframe can be made from a dict of lists\n", + "d = {\"c1\":[c1], \"c2\":[c2]}\n", + "pd.DataFrame(d)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "### 7a. 
*Predict* the output of the following code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# qry(\"\"\"SELECT species, COUNT(SPECIES) AS count,\n", + "# AVG(diameter) AS size\n", + "# FROM trees\n", + "# GROUP BY species ORDER BY count DESC\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7b. *Convert* the above code to *Pandas*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# part 1: species list\n", + "# species_list = list(pd.unique(trees['species']))\n", + "species_list" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# trees.groupby(\"species\").mean(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# part 2: size\n", + "#size_list = list(trees.groupby(\"species\").mean(numeric_only=True)[\"diameter\"]) \n", + "size_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# part 3: counts\n", + "#count_list = list(trees['species'].value_counts())\n", + "count_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# part 4: make a DataFrame from a dict of lists\n", + "pd.DataFrame({\"species\": species_list,\n", + " \"count\": count_list,\n", + " \"size\": size_list})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "# Additional Exercises: \n", + "### *Predict* the output of the following statements" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "hydrants" + ] + }, + { + "cell_type": "code", +
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#qry(\"SELECT color, year FROM hydrants WHERE color = 'blue' \")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#df = qry(\"SELECT color, year FROM hydrants\")\n", + "#df[df.color == \"blue\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#qry(\"SELECT year FROM hydrants WHERE owner='private' AND active\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#df = qry(\"SELECT year, style, active FROM hydrants\")\n", + "#df[df.active == 1][\"style\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hydrants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#hydrants[\"color\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#qry(\"\"\"SELECT color, COUNT(*) FROM hydrants WHERE active GROUP BY color\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#qry(\"\"\"SELECT color, COUNT(*) AS count FROM hydrants GROUP BY color HAVING count > 1\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#qry(\"\"\"SELECT color, COUNT(*) AS count\n", + "# FROM hydrants WHERE year >= 2000\n", + "# GROUP BY color HAVING count < 2\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 +
}, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/s24/Louis_Lecture_Notes/36_Database3/worksheet.db b/s24/Louis_Lecture_Notes/36_Database3/worksheet.db new file mode 100644 index 0000000000000000000000000000000000000000..3607f4107144d2a559212f8e1813b005c499747c Binary files /dev/null and b/s24/Louis_Lecture_Notes/36_Database3/worksheet.db differ