Skip to content
Snippets Groups Projects
Commit 0a3b8b7a authored by Cole Nelson's avatar Cole Nelson
Browse files

cole lec35

parent 92ac16fc
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
import pandas as pd import pandas as pd
from pandas import DataFrame, Series from pandas import DataFrame, Series
   
import sqlite3 import sqlite3
import os import os
   
import matplotlib import matplotlib
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
   
import requests import requests
matplotlib.rcParams["font.size"] = 12 matplotlib.rcParams["font.size"] = 12
``` ```
   
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
   
![image.png](attachment:image.png)
%% Cell type:markdown id: tags:
### IRIS dataset: http://archive.ics.uci.edu/ml/datasets/iris ### IRIS dataset: http://archive.ics.uci.edu/ml/datasets/iris
- This set of data is used in beginning Machine Learning Courses - This set of data is used in beginning Machine Learning Courses
- You can train a ML algorithm to use the values to predict the class of iris - You can train a ML algorithm to use the values to predict the class of iris
- Dataset link: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data - Dataset link: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Warmup 1: Requests and file writing

# use requests to get this file "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
response = requests.get("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data")

# check that the request was successful
response.raise_for_status()

# save the data locally as "iris.csv" to avoid spamming their server;
# the with-statement closes the file object for us, even if the write fails
with open("iris.csv", "w") as file_obj:
    # write the text of response to the file object
    file_obj.write(response.text)

# Look at the file you downloaded. What's wrong with it?
``` ```
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Warmup 2: Making a DataFrame

# read the "iris.csv" file into a Pandas dataframe
# (no header options are given here on purpose -- compare the result with Warmup 3)
iris_df = pd.read_csv("iris.csv")

# display the head of the data frame
iris_df.head()
``` ```
   
%% Output %% Output
   
5.1 3.5 1.4 0.2 Iris-setosa 5.1 3.5 1.4 0.2 Iris-setosa
0 4.9 3.0 1.4 0.2 Iris-setosa 0 4.9 3.0 1.4 0.2 Iris-setosa
1 4.7 3.2 1.3 0.2 Iris-setosa 1 4.7 3.2 1.3 0.2 Iris-setosa
2 4.6 3.1 1.5 0.2 Iris-setosa 2 4.6 3.1 1.5 0.2 Iris-setosa
3 5.0 3.6 1.4 0.2 Iris-setosa 3 5.0 3.6 1.4 0.2 Iris-setosa
4 5.4 3.9 1.7 0.4 Iris-setosa 4 5.4 3.9 1.7 0.4 Iris-setosa
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Warmup 3: Our CSV file has no header....let's add column names.
# Refer to the documentation: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html

# Attribute Information:
# 1. sepal length in cm
# 2. sepal width in cm
# 3. petal length in cm
# 4. petal width in cm
# 5. class: Iris Setosa, Iris Versicolour, Iris Virginica

# These should be our headers ["sep-length", "sep-width", "pet-length", "pet-width", "class"]

# passing names= supplies the column labels that the file itself lacks
iris_df = pd.read_csv(
    "iris.csv",
    names=["sep-length", "sep-width", "pet-length", "pet-width", "class"],
)
iris_df.head()
``` ```
   
%% Output %% Output
   
sep-length sep-width pet-length pet-width class sep-length sep-width pet-length pet-width class
0 5.1 3.5 1.4 0.2 Iris-setosa 0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Warmup 4: Connect to our database version of this data!
iris_conn = sqlite3.connect("iris-flowers.db")

# sqlite_master is SQLite's catalog table; list the tables stored in this database
pd.read_sql("SELECT * FROM sqlite_master WHERE type='table'", iris_conn)
``` ```
   
%% Output %% Output
   
type name tbl_name rootpage \ type name tbl_name rootpage \
0 table iris iris 2 0 table iris iris 2
sql sql
0 CREATE TABLE "iris" (\n"sep-length" REAL,\n "... 0 CREATE TABLE "iris" (\n"sep-length" REAL,\n "...
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Warmup 5: Using SQL, get the 10 'Iris-setosa' flowers with the longest sepal length.
# Break any ties by ordering by the shortest sepal width.

# Column names containing "-" must be quoted; standard SQL uses double quotes for identifiers.
pd.read_sql("""
SELECT *
FROM iris
WHERE class = 'Iris-setosa'
ORDER BY "sep-length" DESC, "sep-width" ASC
LIMIT 10
""", iris_conn)
``` ```
   
%% Output %% Output
   
sep-length sep-width pet-length pet-width class sep-length sep-width pet-length pet-width class
0 5.8 4.0 1.2 0.2 Iris-setosa 0 5.8 4.0 1.2 0.2 Iris-setosa
1 5.7 3.8 1.7 0.3 Iris-setosa 1 5.7 3.8 1.7 0.3 Iris-setosa
2 5.7 4.4 1.5 0.4 Iris-setosa 2 5.7 4.4 1.5 0.4 Iris-setosa
3 5.5 3.5 1.3 0.2 Iris-setosa 3 5.5 3.5 1.3 0.2 Iris-setosa
4 5.5 4.2 1.4 0.2 Iris-setosa 4 5.5 4.2 1.4 0.2 Iris-setosa
5 5.4 3.4 1.7 0.2 Iris-setosa 5 5.4 3.4 1.7 0.2 Iris-setosa
6 5.4 3.4 1.5 0.4 Iris-setosa 6 5.4 3.4 1.5 0.4 Iris-setosa
7 5.4 3.7 1.5 0.2 Iris-setosa 7 5.4 3.7 1.5 0.2 Iris-setosa
8 5.4 3.9 1.7 0.4 Iris-setosa 8 5.4 3.9 1.7 0.4 Iris-setosa
9 5.4 3.9 1.3 0.4 Iris-setosa 9 5.4 3.9 1.3 0.4 Iris-setosa
   
%% Cell type:code id: tags:
``` python
# Warmup 6: Using SQL, get the average petal length and width for each class of flower.
# One output row per class: GROUP BY makes AVG() aggregate within each class.
avg_petal_query = """
SELECT class, AVG("pet-length"), AVG("pet-width")
FROM iris
GROUP BY class
"""
pd.read_sql(avg_petal_query, iris_conn)
```
%% Output
class AVG("pet-length") AVG("pet-width")
0 Iris-setosa 1.464 0.244
1 Iris-versicolor 4.260 1.326
2 Iris-virginica 5.552 2.026
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
   
# Lecture 36: Scatter Plots # Lecture 35: Scatter Plots
**Learning Objectives** **Learning Objectives**
- Set the marker, color, and size of scatter plot data - Set the marker, color, and size of scatter plot data
- Calculate correlation between DataFrame columns - Calculate correlation between DataFrame columns
- Use subplots to group scatterplot data - Use subplots to group scatterplot data
   
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
   
## Set the marker, color, and size of scatter plot data ## Set the marker, color, and size of scatter plot data
   
To start, let's look at some made-up data about Trees. To start, let's look at some made-up data about Trees.
The city of Madison maintains a database of all the trees they care for. The city of Madison maintains a database of all the trees they care for.
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Made-up tree measurements: one dict per tree, with its age, height and diameter.
trees = [
    {"age": 1, "height": 1.5, "diameter": 0.8},
    {"age": 1, "height": 1.9, "diameter": 1.2},
    {"age": 1, "height": 1.8, "diameter": 1.4},
    {"age": 2, "height": 1.8, "diameter": 0.9},
    {"age": 2, "height": 2.5, "diameter": 1.5},
    {"age": 2, "height": 3, "diameter": 1.8},
    {"age": 2, "height": 2.9, "diameter": 1.7},
    {"age": 3, "height": 3.2, "diameter": 2.1},
    {"age": 3, "height": 3, "diameter": 2},
    {"age": 3, "height": 2.4, "diameter": 2.2},
    {"age": 2, "height": 3.1, "diameter": 2.9},
    {"age": 4, "height": 2.5, "diameter": 3.1},
    {"age": 4, "height": 3.9, "diameter": 3.1},
    {"age": 4, "height": 4.9, "diameter": 2.8},
    {"age": 4, "height": 5.2, "diameter": 3.5},
    {"age": 4, "height": 4.8, "diameter": 4},
]

# A list of dicts becomes a DataFrame with one row per dict and one column per key.
trees_df = DataFrame(trees)
trees_df.head()
``` ```
   
%% Output %% Output
   
age height diameter age height diameter
0 1 1.5 0.8 0 1 1.5 0.8
1 1 1.9 1.2 1 1 1.9 1.2
2 1 1.8 1.4 2 1 1.8 1.4
3 2 1.8 0.9 3 2 1.8 0.9
4 2 2.5 1.5 4 2 2.5 1.5
   
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
   
### Scatter Plots ### Scatter Plots
We can make a scatter plot of a DataFrame using the following function... We can make a scatter plot of a DataFrame using the following function...
   
`df_name.plot.scatter(x="x_col_name", y="y_col_name", color="peachpuff")` `df_name.plot.scatter(x="x_col_name", y="y_col_name", color="peachpuff")`
   
Plot the trees data comparing a tree's age to its height... Plot the trees data comparing a tree's age to its height...
- What is `df_name`? - What is `df_name`?
- What is `x_col_name`? - What is `x_col_name`?
- What is `y_col_name`? - What is `y_col_name`?
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Scatter plot of tree age (x) against height (y), drawn in green.
trees_df.plot.scatter(x="age", y="height", color="g")  # TODO: change y to diameter
``` ```
   
%% Output %% Output
   
<Axes: xlabel='age', ylabel='height'> <Axes: xlabel='age', ylabel='height'>
   
   
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
   
Now plot with a little more beautification... Now plot with a little more beautification...
- Use a new [color](https://matplotlib.org/3.5.0/_images/sphx_glr_named_colors_003.png) - Use a new [color](https://matplotlib.org/3.5.0/_images/sphx_glr_named_colors_003.png)
- Use a type of [marker](https://matplotlib.org/stable/api/markers_api.html) - Use a type of [marker](https://matplotlib.org/stable/api/markers_api.html)
- Change the size (any int) - Change the size (any int)
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Plot with some more beautification options.
# color sets the marker colour, marker the shape, and s the marker size.
trees_df.plot.scatter(x="age", y="height", color="r", marker="D", s=50)  # D for diamond
``` ```
   
%% Output %% Output
   
<Axes: xlabel='age', ylabel='height'> <Axes: xlabel='age', ylabel='height'>
   
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Add a title to your plot.
# plot.scatter returns the Axes it drew on; keep it so the title can be set on it.
ax = trees_df.plot.scatter(x="age", y="height", color="r", marker="D", s=50)  # D for diamond
ax.set_title("Tree Age vs Height")
``` ```
   
%% Output %% Output
   
Text(0.5, 1.0, 'Tree Age vs Height') Text(0.5, 1.0, 'Tree Age vs Height')
   
   
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
   
#### Correlation #### Correlation
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# What is the correlation between our DataFrame columns?
# corr() returns a square DataFrame: one row and one column per column of trees_df.
corr_df = trees_df.corr()
corr_df
``` ```
   
%% Output %% Output
   
age height diameter age height diameter
age 1.000000 0.797468 0.854578 age 1.000000 0.797468 0.854578
height 0.797468 1.000000 0.839345 height 0.797468 1.000000 0.839345
diameter 0.854578 0.839345 1.000000 diameter 0.854578 0.839345 1.000000
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# What is the correlation between age and height (don't use .iloc)
# Select the "age" column of the correlation table, then its "height" entry.
corr_df['age']['height']
``` ```
   
%% Output %% Output
   
0.7974683544303798 0.7974683544303798
   
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
   
### Varying Stylistic Parameters ### Varying Stylistic Parameters
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Option 1: pass a column name as s, so each marker is sized by that tree's diameter
trees_df.plot.scatter(x="age", y="height", marker="H", s="diameter")
``` ```
   
%% Output %% Output
   
<Axes: xlabel='age', ylabel='height'> <Axes: xlabel='age', ylabel='height'>
   
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Option 2: pass a Series of sizes as s
# this way allows you to make it bigger (here every diameter is scaled by 50)
trees_df.plot.scatter(x="age", y="height", marker="H", s=trees_df["diameter"] * 50)
``` ```
   
%% Output %% Output
   
<Axes: xlabel='age', ylabel='height'> <Axes: xlabel='age', ylabel='height'>
   
   
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
   
## Use subplots to group scatterplot data ## Use subplots to group scatterplot data
   
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
   
### Re-visit the Iris Data ### Re-visit the Iris Data
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Display the iris DataFrame that was loaded from "iris.csv" in the warmup.
iris_df
``` ```
   
%% Output %% Output
   
sep-length sep-width pet-length pet-width class sep-length sep-width pet-length pet-width class
0 5.1 3.5 1.4 0.2 Iris-setosa 0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa
.. ... ... ... ... ... .. ... ... ... ... ...
145 6.7 3.0 5.2 2.3 Iris-virginica 145 6.7 3.0 5.2 2.3 Iris-virginica
146 6.3 2.5 5.0 1.9 Iris-virginica 146 6.3 2.5 5.0 1.9 Iris-virginica
147 6.5 3.0 5.2 2.0 Iris-virginica 147 6.5 3.0 5.2 2.0 Iris-virginica
148 6.2 3.4 5.4 2.3 Iris-virginica 148 6.2 3.4 5.4 2.3 Iris-virginica
149 5.9 3.0 5.1 1.8 Iris-virginica 149 5.9 3.0 5.1 1.8 Iris-virginica
[150 rows x 5 columns] [150 rows x 5 columns]
   
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
   
### How do we create a *scatter plot* for various *class types*? ### How do we create a *scatter plot* for various *class types*?
First, gather all the class types. First, gather all the class types.
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# In Pandas
# set() removes the duplicates but has no guaranteed order, so sort the result:
# the later cells pair colors/markers with varieties by position, and a changing
# order would give a class a different color from one run to the next.
varietes = sorted(set(iris_df["class"]))
varietes
``` ```
   
%% Output %% Output
   
['Iris-versicolor', 'Iris-setosa', 'Iris-virginica'] ['Iris-virginica', 'Iris-setosa', 'Iris-versicolor']
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# In SQL
# DISTINCT keeps one row per class; take the "class" column of the result as a list.
varietes = list(pd.read_sql("""
SELECT DISTINCT class
FROM iris
""", iris_conn)["class"])
varietes
``` ```
   
%% Output %% Output
   
['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'] ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
   
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
   
In reality, you can choose to write Pandas or SQL queries (or a mix of both!). For the rest of this lecture, we'll use Pandas. In reality, you can choose to write Pandas or SQL queries (or a mix of both!). For the rest of this lecture, we'll use Pandas.
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# If you want to continue using SQL instead, don't close the connection!
# The rest of the lecture only uses iris_df, so the database is no longer needed.
iris_conn.close()
``` ```
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Change this scatter plot so that the data is only for class ='Iris-setosa'
# The boolean mask keeps just the rows whose class is 'Iris-setosa' before plotting.
iris_df[iris_df["class"] == 'Iris-setosa'].plot.scatter(x="pet-width", y="pet-length")
``` ```
   
%% Output %% Output
   
<Axes: xlabel='pet-width', ylabel='pet-length'> <Axes: xlabel='pet-width', ylabel='pet-length'>
   
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Write a for loop that iterates through each variety in varietes
# and makes a plot for only that class

for variety in varietes:
    # make a df of just the data for this variety
    variety_df = iris_df[iris_df["class"] == variety]

    # make a scatter plot for this variety
    variety_df.plot.scatter(x="pet-width", y="pet-length", label=variety)
``` ```
   
%% Output %% Output
   
   
   
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# copy/paste the code above, but this time make each plot a different color
colors = ["blue", "green", "red"]

# zip pairs each variety with the color at the same position
for variety, color in zip(varietes, colors):
    # make a df of just the data for this variety
    variety_df = iris_df[iris_df["class"] == variety]

    # make a scatter plot for this variety
    variety_df.plot.scatter(x="pet-width", y="pet-length", label=variety, color=color)
``` ```
   
%% Output %% Output
   
   
   
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# copy/paste the code above, but this time make each plot a different color AND marker
colors = ["blue", "green", "red"]
markers = ["o", "^", "v"]

# zip pairs each variety with the color and marker at the same position
for variety, color, marker in zip(varietes, colors, markers):
    # make a df of just the data for this variety
    variety_df = iris_df[iris_df["class"] == variety]

    # make a scatter plot for this variety
    variety_df.plot.scatter(x="pet-width", y="pet-length", label=variety,
                            color=color, marker=marker)
``` ```
   
%% Output %% Output
   
   
   
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Did you notice that it made 3 plots?!?! What's deceiving about this? # Did you notice that it made 3 plots?!?! What's deceiving about this?
``` ```
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
colors = ["blue", "green", "red"]
markers = ["o", "^", "v"]

# Axis limits taken from the WHOLE data set, so every plot uses the same scale
min_x = iris_df["pet-width"].min()
max_x = iris_df["pet-width"].max()
min_y = iris_df["pet-length"].min()
max_y = iris_df["pet-length"].max()

for variety, color, marker in zip(varietes, colors, markers):
    # make a df of just the data for this variety
    variety_df = iris_df[iris_df["class"] == variety]

    # make a scatter plot for this variety, using the shared x and y limits
    variety_df.plot.scatter(x="pet-width", y="pet-length", label=variety,
                            color=color, marker=marker,
                            xlim=(min_x, max_x), ylim=(min_y, max_y))
``` ```
   
%% Output %% Output
   
   
   
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Have to be VERY careful to not crop out data. # Have to be VERY careful to not crop out data.
# We'll talk about this next lecture. # We'll talk about this next lecture.
``` ```
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Better yet, we could combine these. # Better yet, we could combine these.
``` ```
   
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
   
### We can make Subplots in plots, called an AxesSubplot, keyword ax ### We can make Subplots in plots, called an AxesSubplot, keyword ax
1. if AxesSubplot ax passed, then plot in that subplot 1. if AxesSubplot ax passed, then plot in that subplot
2. if ax is None, create a new AxesSubplot 2. if ax is None, create a new AxesSubplot
3. return AxesSubplot that was used 3. return AxesSubplot that was used
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# complete this code to make 3 plots in one

plot_area = None  # don't change this...look at how this variable is used as ax in the last line of the loop
colors = ["blue", "green", "red"]
markers = ["o", "^", "v"]

for variety, color, marker in zip(varietes, colors, markers):
    # make a df of just the data for this variety
    variety_df = iris_df[iris_df["class"] == variety]

    # make a scatter plot for this variety
    # first pass: ax is None, so a new plot is created and returned;
    # later passes: the returned plot is passed back in, so the points share it
    plot_area = variety_df.plot.scatter(x="pet-width", y="pet-length", label=variety,
                                        color=color, marker=marker, ax=plot_area)
``` ```
   
%% Output %% Output
   
   
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
   
### Time-Permitting ### Time-Permitting
Plot this data in an interesting/meaningful way & identify any correlations. Plot this data in an interesting/meaningful way & identify any correlations.
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Small made-up data set: one row per student, one column per dict key.
students = pd.DataFrame({
    "name": ["Cole", "Cynthia", "Alice", "Seth"],
    "grade": ["C", "AB", "B", "BC"],
    "gpa": [2.0, 3.5, 3.0, 2.5],
    "attendance": [4, 11, 10, 6],
    "height": [68, 66, 60, 72],
})
students
``` ```
   
%% Output %% Output
   
name grade gpa attendance height name grade gpa attendance height
0 Cole C 2.0 4 68 0 Cole C 2.0 4 68
1 Cynthia AB 3.5 11 66 1 Cynthia AB 3.5 11 66
2 Alice B 3.0 10 60 2 Alice B 3.0 10 60
3 Seth BC 2.5 6 72 3 Seth BC 2.5 6 72
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Min, Max, and Overall Difference in Student Height
min_height = students["height"].min()
max_height = students["height"].max()
diff_height = max_height - min_height

# Normalize students heights on a scale of [0, 1] (black to white)
height_colors = (students["height"] - min_height) / diff_height

# Normalize students heights on a scale of [0, 0.5] (black to gray)
height_colors = height_colors / 2

# Color must be a string (e.g. c='0.34')
height_colors = height_colors.astype("string")

height_colors
``` ```
   
%% Output %% Output
   
0 0.3333333333333333 0 0.3333333333333333
1 0.25 1 0.25
2 0.0 2 0.0
3 0.5 3 0.5
Name: height, dtype: string Name: height, dtype: string
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Attendance (x) against GPA (y); each point is shaded by the student's normalized height.
students.plot.scatter(x="attendance", y="gpa", c=height_colors)
``` ```
   
%% Output %% Output
   
<Axes: xlabel='attendance', ylabel='gpa'> <Axes: xlabel='attendance', ylabel='gpa'>
   
   
%% Cell type:code id: tags: %% Cell type:code id: tags:
   
``` python ``` python
# Correlate only the numeric columns (gpa, attendance, height).
# "name" and "grade" hold strings; relying on the old default to drop them is
# deprecated, so ask for numeric_only explicitly.
students.corr(numeric_only=True)
``` ```
   
%% Output %% Output
   
C:\Users\ctnelson1997\AppData\Local\Temp\ipykernel_12312\882796491.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. C:\Users\ctnelson1997\AppData\Local\Temp\ipykernel_20336\882796491.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
students.corr() students.corr()
   
gpa attendance height gpa attendance height
gpa 1.000000 0.976831 -0.464758 gpa 1.000000 0.976831 -0.464758
attendance 0.976831 1.000000 -0.635586 attendance 0.976831 1.000000 -0.635586
height -0.464758 -0.635586 1.000000 height -0.464758 -0.635586 1.000000
   
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
   
![image.png](attachment:image.png) ![image.png](attachment:image.png)
   
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
   
https://www.researchgate.net/publication/247907373_Stupid_Data_Miner_Tricks_Overfitting_the_SP_500 https://www.researchgate.net/publication/247907373_Stupid_Data_Miner_Tricks_Overfitting_the_SP_500
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment