diff --git a/f22/andy_lec_notes/lec15_Oct12_CSV_Files/lec15_csvfiles_completed.ipynb b/f22/andy_lec_notes/lec15_Oct12_CSV_Files/lec15_csvfiles_completed.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..50a42131b8d0ea029df0f8306482e629eb58958e --- /dev/null +++ b/f22/andy_lec_notes/lec15_Oct12_CSV_Files/lec15_csvfiles_completed.ipynb @@ -0,0 +1,904 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Warmup #1: Take a look at these list methods \n", + "# https://www.w3schools.com/python/python_ref_list.asp\n", + "dairy = [\"milk\", \"ice cream\", \"cheese\", \"yogurt\" ]\n", + "\n", + "#use the .index() method to get the index of \"ice cream\"\n", + "\n", + "dairy.index(\"ice cream\")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "peanut butter is not dairy\n", + "milk is dairy\n", + "bread is not dairy\n", + "cheese is dairy\n", + "YOGURT is not dairy\n" + ] + } + ], + "source": [ + "# Warmup #2: Because a list is a sequence, we can use the 'in' operator\n", + "food_shelf = [\"peanut butter\", \"milk\", \"bread\", \"cheese\", \"YOGURT\"]\n", + "for item in food_shelf:\n", + " if item in dairy:\n", + " print(item, \"is dairy\")\n", + " else:\n", + " print(item, \"is not dairy\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lecture 15: CSV Files\n", + "\n", + "\n", + "## Learning Objectives\n", + "After this lecture you will be able to...\n", + "- Open an Excel file and export it to a Comma Separated Value file.\n", + "\n", + "- Open a CSV file in TextEditor/Jupyter and connect the elements of the CSV file to the rows and columns in the spreadsheet.\n", + "\n", + "- Use pre-written Python code to 
read a CSV file into a list of lists.\n", + "\n", + "- Write Python statements with double list indexing to access any element of a CSV file via a list of lists.\n", + "\n", + "- Write code that answers questions about CSV data by writing for loops on lists of lists.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "# Open the file of student survey data in Jupyter\n", + "\n", + "# Then open it in Windows ... what program opened? " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Open a CSV file in TextEditor/Jupyter and connect the elements of the CSV file to the rows and columns in the spreadsheet." + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "# What do you notice? Take notes here\n", + "# " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use pre-written Python code to read a CSV file into a list of lists." 
# inspired by https://automatetheboringstuff.com/chapter14/
import csv

def process_csv(filename):
    """Read a CSV file and return its contents as a list of lists.

    Args:
        filename: path of the CSV file to read; it is decoded as UTF-8.

    Returns:
        A list with one entry per CSV row, in file order. Each entry is a
        list of that row's fields as strings. An empty file gives [].

    Raises:
        OSError: if the file cannot be opened (for example
            FileNotFoundError when the path does not exist).
    """
    # newline="" hands line-ending handling to the csv module, which is what
    # its documentation asks for; without it, a quoted field that contains a
    # line break can be read incorrectly.
    # The with-statement closes the file even if reading raises part-way.
    with open(filename, encoding="utf-8", newline="") as csv_file:
        # csv.reader yields one list of strings per row; list() collects
        # them all while the file is still open.
        return list(csv.reader(csv_file))
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Store the header row into cs220_header\n", + "cs220_header = cs220_csv[0]\n", + "cs220_header" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['mushroom',\n", + " 'Florida',\n", + " '7',\n", + " 'early bird',\n", + " 'March',\n", + " '',\n", + " '30.263214888389417, -81.54792098150529'],\n", + " ['pineapple',\n", + " 'Wisconsin',\n", + " '4',\n", + " 'night owl',\n", + " 'April',\n", + " 'other',\n", + " '43.1581437, -89.2921125'],\n", + " ['sausage',\n", + " 'Wisconsin',\n", + " '10',\n", + " 'early bird',\n", + " 'July',\n", + " 'other',\n", + " '43.15645, -89.28814'],\n", + " ['pepperoni',\n", + " 'WI',\n", + " '7',\n", + " 'no preference',\n", + " 'September',\n", + " 'dog,cat',\n", + " '43.073051, -89.401230'],\n", + " ['mushroom', 'madison', '7', 'early bird', 'November', '', ''],\n", + " ['pepperoni',\n", + " 'FL',\n", + " '1',\n", + " 'no preference',\n", + " 'December',\n", + " 'dog',\n", + " '42.35623761108948, -71.05691488946681'],\n", + " ['pepperoni',\n", + " 'Wisconsin',\n", + " '2',\n", + " 'night owl',\n", + " 'February',\n", + " '',\n", + " '43.159045128642774, -89.29146323507756'],\n", + " ['mushroom',\n", + " 'Florida',\n", + " '0.5',\n", + " 'night owl',\n", + " 'May',\n", + " 'other',\n", + " '43.160601, -89.287671'],\n", + " ['mushroom',\n", + " 'Wisconsin',\n", + " '10',\n", + " 'no preference',\n", + " 'January',\n", + " 'dog,fish',\n", + " '43.1562216,-89.2880086'],\n", + " ['pineapple',\n", + " 'Wisconsin',\n", + " '8',\n", + " 'night owl',\n", + " 'July',\n", + " 'dog',\n", + " '43.158655, -89.289895'],\n", + " ['sausage',\n", + " 'Minnesota',\n", + " '15',\n", + " 'no preference',\n", + " 'August',\n", + " 'dog,cat',\n", + " '45.13881645889933, -93.47636590830673'],\n", + " ['pepperoni',\n", + " 'New Jersey',\n", + " '1',\n", + " 'night owl',\n", + " 'May',\n", + " 'other',\n", 
+ " '43.07148896663423, -89.40567798752735'],\n", + " ['basil',\n", + " 'Rhode Island',\n", + " '1',\n", + " 'night owl',\n", + " 'March',\n", + " 'dog',\n", + " '43.156490793353775, -89.28796434617352'],\n", + " ['mushroom', 'TX', '1', 'no preference', 'January', 'dog', ''],\n", + " ['pineapple',\n", + " 'Florida',\n", + " '3',\n", + " 'early bird',\n", + " 'July',\n", + " 'other',\n", + " '27.979191147972834, -82.33356380365498'],\n", + " ['sausage',\n", + " 'Wisconsin',\n", + " '0',\n", + " 'early bird',\n", + " 'December',\n", + " 'dog,cat',\n", + " '43.15631441766965, -89.28785659081201'],\n", + " ['pineapple',\n", + " 'Wisconsin',\n", + " '6',\n", + " 'no preference',\n", + " 'June',\n", + " 'dog',\n", + " '43.157716440341964, -89.28939262164963'],\n", + " ['mushroom',\n", + " 'Florida',\n", + " '7',\n", + " 'no preference',\n", + " 'July',\n", + " 'other',\n", + " '30.053546, -81.514610'],\n", + " ['sausage',\n", + " 'Florida',\n", + " '3',\n", + " 'early bird',\n", + " 'January',\n", + " 'dog,fish',\n", + " '30.263357, -81.547884'],\n", + " ['mac&cheese',\n", + " 'Wisconsin',\n", + " '5',\n", + " 'night owl',\n", + " 'July',\n", + " 'dog',\n", + " '43.158328032172754, -89.28946714938327'],\n", + " ['pepperoni',\n", + " 'Wisconsin',\n", + " '10',\n", + " 'early bird',\n", + " 'April',\n", + " 'other',\n", + " '43.1884213,-89.2762121'],\n", + " ['other',\n", + " 'Wisconsin',\n", + " '10',\n", + " 'early bird',\n", + " 'August',\n", + " 'other',\n", + " '43.15833, -89.28988'],\n", + " ['sausage',\n", + " 'WI',\n", + " '14',\n", + " 'night owl',\n", + " 'September',\n", + " 'dog,cat',\n", + " '43.15733597381252, -89.29013010509833'],\n", + " ['sausage',\n", + " 'Wisconsin',\n", + " '6',\n", + " 'no preference',\n", + " 'August',\n", + " 'dog,cat',\n", + " '43.159061371631616, -89.29141118826759'],\n", + " ['pepperoni',\n", + " 'Wisconsin',\n", + " '8',\n", + " 'early bird',\n", + " 'September',\n", + " 'dog,cat,fish',\n", + " '43.158359 -89.289972'],\n", + " 
['pineapple',\n", + " 'Florida',\n", + " '8',\n", + " 'night owl',\n", + " 'October',\n", + " '',\n", + " '30.263432655702932, -81.54807118535949'],\n", + " ['pineapple',\n", + " 'TX',\n", + " '4',\n", + " 'night owl',\n", + " 'October',\n", + " 'dog',\n", + " '42.3558293029345, -71.05683171712127'],\n", + " ['other', 'WI', '2', 'early bird', 'June', '', ''],\n", + " ['mushroom',\n", + " 'Wisconsin',\n", + " '20',\n", + " 'early bird',\n", + " 'September',\n", + " 'dog',\n", + " '43.15826500058843, -89.28945716165009'],\n", + " ['sausage',\n", + " 'Wisconsin',\n", + " '8',\n", + " 'night owl',\n", + " 'June',\n", + " 'dog',\n", + " '43.15839022178169, -89.28998287477457'],\n", + " ['sausage',\n", + " 'Wisconsin',\n", + " '20',\n", + " 'night owl',\n", + " 'April',\n", + " 'bird',\n", + " '43.15648555750267, -89.28783647996661'],\n", + " ['pineapple', 'Texas', '0.5', 'early bird', 'August', 'other', '43, 89']]" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Store all of the data rows into cs220_data\n", + "cs220_data = cs220_csv[1:]\n", + "cs220_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CSVs as a List of Lists" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "32" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Determine how many students completed the survey.\n", + "len(cs220_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['mushroom',\n", + " 'Florida',\n", + " '7',\n", + " 'early bird',\n", + " 'March',\n", + " '',\n", + " '30.263214888389417, -81.54792098150529']" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# show the entire 1st row of actual data\n", + 
"cs220_data[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'pineapple'" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Find the pizza topping for the 2nd student...by hardcoding its row and column....\n", + "cs220_data[1][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'43.073051, -89.401230'" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Find the lat-long of the 4th student...by hardcoding its row and column....\n", + "cs220_data[3][-1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "early bird\t\t\n", + "night owl\t\tother\n", + "early bird\t\tother\n", + "no preference\t\tdog,cat\n", + "early bird\t\t\n", + "no preference\t\tdog\n", + "night owl\t\t\n", + "night owl\t\tother\n", + "no preference\t\tdog,fish\n", + "night owl\t\tdog\n", + "no preference\t\tdog,cat\n", + "night owl\t\tother\n", + "night owl\t\tdog\n", + "no preference\t\tdog\n", + "early bird\t\tother\n", + "early bird\t\tdog,cat\n", + "no preference\t\tdog\n", + "no preference\t\tother\n", + "early bird\t\tdog,fish\n", + "night owl\t\tdog\n", + "early bird\t\tother\n", + "early bird\t\tother\n", + "night owl\t\tdog,cat\n", + "no preference\t\tdog,cat\n", + "early bird\t\tdog,cat,fish\n", + "night owl\t\t\n", + "night owl\t\tdog\n", + "early bird\t\t\n", + "early bird\t\tdog\n", + "night owl\t\tdog\n", + "night owl\t\tbird\n", + "early bird\t\tother\n" + ] + } + ], + "source": [ + "# Print out every student's sleep habit and pets\n", + "# use for \n", + "for row in 
# Find the average years of people who do not live in Wisconsin

# Values of the "state" column that this analysis counts as Wisconsin.
wisconsin_labels = ["Wisconsin", "WI", "madison"]

# Look the column positions up once, not once per row.
years_idx = cs220_header.index("years")
state_idx = cs220_header.index("state")

total_years = 0
count = 0
for row in cs220_data:
    state = row[state_idx]
    if state not in wisconsin_labels:
        # Convert only the rows we actually use.
        years = float(row[years_idx])
        total_years += years
        count += 1
        print(state, years)

# With no matching rows there is nothing to average; avoid dividing by zero.
avg = total_years / count if count > 0 else None
avg
# We want to invoke something like...
# cell(24, <colName>)
def cell_v1(row_idx, col_name):
    """Return one cell of the survey data, exactly as stored.

    Args:
        row_idx: position of the row inside cs220_data.
        col_name: column name; its position is looked up in cs220_header.

    Returns:
        The value at that row and column, with no type conversion.

    Raises:
        ValueError: if col_name is not in cs220_header (from list.index).
        IndexError: if row_idx is outside cs220_data.
    """
    column = cs220_header.index(col_name)  # column name -> column position
    row = cs220_data[row_idx]              # pick the requested row
    return row[column]
# Improve the cell function so it returns the appropriate type.
# If there is nothing in the cell, return None
# otherwise, use the col_name to convert to the expected type
def cell(row_idx, col_name, data=None, header=None):
    """Return one cell of a CSV table, converted to a useful Python type.

    Args:
        row_idx: position of the row inside the data rows.
        col_name: name of the column to read.
        data: optional list of data rows (a list of lists of strings).
            When omitted, the notebook's cs220_data is used.
        header: optional list of column names matching the rows in data.
            When omitted, the notebook's cs220_header is used.

    Returns:
        None if the cell is the empty string; a float for the "years"
        column; otherwise the cell's string unchanged.

    Raises:
        ValueError: if col_name is not in the header, or if a non-empty
            "years" cell is not a valid number.
        IndexError: if row_idx is outside the data rows.
    """
    # Fall back to the notebook-level table so cell(row_idx, col_name)
    # keeps working exactly as before.
    if data is None:
        data = cs220_data
    if header is None:
        header = cs220_header

    col_idx = header.index(col_name)  # get the index of col_name
    val = data[row_idx][col_idx]      # get the value at the specified cell

    # An empty cell means the question was not answered. This check must
    # come before any conversion, because float("") would raise.
    if val == "":
        return None

    # Choose the conversion by column name rather than by position, so it
    # still works if the columns are reordered.
    if col_name == "years":
        return float(val)
    return val
Try it out on your own" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}