From 67952a9a92d6f8304899182f730e35f02aab87ce Mon Sep 17 00:00:00 2001
From: gsingh58 <gurmail-singh@wisc.edu>
Date: Mon, 8 Apr 2024 05:50:59 -0500
Subject: [PATCH] lec24 updated

---
 .../24-clustering/24-clustering.ipynb         | 2093 +++++++++++++++++
 .../24-clustering/24-clustering_001.ipynb     |  916 ++++++++
 .../24-clustering/24-clustering_002.ipynb     |  916 ++++++++
 3 files changed, 3925 insertions(+)
 create mode 100644 lecture_material/24-clustering/24-clustering.ipynb
 create mode 100644 lecture_material/24-clustering/24-clustering_001.ipynb
 create mode 100644 lecture_material/24-clustering/24-clustering_002.ipynb

diff --git a/lecture_material/24-clustering/24-clustering.ipynb b/lecture_material/24-clustering/24-clustering.ipynb
new file mode 100644
index 0000000..05131a6
--- /dev/null
+++ b/lecture_material/24-clustering/24-clustering.ipynb
@@ -0,0 +1,2093 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "035e9e2c-9781-4b9c-8395-be9e55e4e082",
+   "metadata": {},
+   "source": [
+    "# Clustering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "cbd48a28",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import geopandas as gpd\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "from sklearn import datasets\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.linear_model import LinearRegression, LogisticRegression\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.preprocessing import PolynomialFeatures\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "\n",
+    "# new import statements\n",
+    "from sklearn.cluster import KMeans, AgglomerativeClustering"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e72ea2f4",
+   "metadata": {},
+   "source": [
+    "# Unsupervised Machine Learning: Clustering\n",
+    "\n",
+    "- In classification (supervised), we try to find boundaries/rules to separate points according to pre-determined labels.\n",
+    "- In clustering, the algorithm chooses the labels. The goal is to choose labels so that similar rows get the same label.\n",
+    "\n",
+    "### K-Means Clustering\n",
+    "\n",
+    "- K: number of clusters:\n",
+    "    - 3-Means => 3 clusters\n",
+    "    - 4-Means => 4 clusters, and so on\n",
+    "- Means: we will find centroids (aka means aka averages) to create clusters\n",
+    "\n",
+    "- import statement:\n",
+    "```python\n",
+    "from sklearn.cluster import KMeans\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3a0ad5a5",
+   "metadata": {},
+   "source": [
+    "#### Iterative algorithm for K-Means"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "0b83aaf3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>x0</th>\n",
+       "      <th>x1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>8.370099</td>\n",
+       "      <td>7.747045</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>-2.701740</td>\n",
+       "      <td>0.395336</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>-3.204128</td>\n",
+       "      <td>-0.407438</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>-3.132762</td>\n",
+       "      <td>-1.335692</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>7.152737</td>\n",
+       "      <td>6.069995</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         x0        x1\n",
+       "0  8.370099  7.747045\n",
+       "1 -2.701740  0.395336\n",
+       "2 -3.204128 -0.407438\n",
+       "3 -3.132762 -1.335692\n",
+       "4  7.152737  6.069995"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Generate 100 random points around 3 blob centers (random_state=3 keeps them reproducible)\n",
+    "x, y = datasets.make_blobs(n_samples=100, centers=3, cluster_std=1.2, random_state=3)\n",
+    "df = pd.DataFrame(x, columns=[\"x0\", \"x1\"])\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "fbced908",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "def km_scatter(df, **kwargs):\n",
+    "    \"\"\"\n",
+    "    Scatter-plot df (x0 on the x-axis, x1 on the y-axis) and return the axes.\n",
+    "    Rows are drawn with a \"?\" marker unless df has a label column of markers.\n",
+    "    Parameters:\n",
+    "        df => DataFrame with x0, x1 and (optionally) label columns\n",
+    "        ax => optional keyword; axes to draw on, passed through to pandas\n",
+    "        other keywords (s, c, color, ...) => forwarded to df.plot.scatter\n",
+    "    \"\"\"\n",
+    "    ax = kwargs.pop(\"ax\", None)\n",
+    "    if \"label\" not in df.columns:\n",
+    "        return df.plot.scatter(x=\"x0\", y=\"x1\", marker=\"$?$\", ax=ax, **kwargs)\n",
+    "\n",
+    "    for marker in set(df[\"label\"]):\n",
+    "        sub_df = df[df[\"label\"] == marker]\n",
+    "        ax = sub_df.plot.scatter(x=\"x0\", y=\"x1\", marker=marker, ax=ax, **kwargs)\n",
+    "    return ax\n",
+    "\n",
+    "ax = km_scatter(df, s=100, c=\"0.7\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "47686eee",
+   "metadata": {},
+   "source": [
+    "### Hard Problem\n",
+    "\n",
+    "Finding the best answer. What is the answer? Determining the centroids of the clusters.\n",
+    "\n",
+    "### Easier Problem\n",
+    "\n",
+    "Take a random answer and make it a little better. Then repeat!\n",
+    "Downside? If randomization leads to a very bad initial choice of centroids, that might lead to bad clustering (for example, a centroid that attracts no points, leaving fewer effective clusters)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "6f8bde9e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<Axes: xlabel='x0', ylabel='x1'>"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "clusters = pd.DataFrame(\n",
+    "    np.random.uniform(-5, 5, size=(3, 2)), columns=[\"x0\", \"x1\"]\n",
+    ").assign(label=[\"o\", \"+\", \"x\"])  # 3 random centroids; label doubles as plot marker\n",
+    "# grey \"?\" = unlabeled data points, red = current centroids\n",
+    "ax = km_scatter(df, c=\"0.7\", s=100)\n",
+    "km_scatter(clusters, ax=ax, c=\"red\", s=200)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a3fe986c",
+   "metadata": {},
+   "source": [
+    "Two variables for us to deal with:\n",
+    "1. clusters: contains location of centroids and a label for them\n",
+    "2. df: contains the actual data points"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "cfa1f1aa",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>x0</th>\n",
+       "      <th>x1</th>\n",
+       "      <th>label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>-3.318190</td>\n",
+       "      <td>3.427192</td>\n",
+       "      <td>o</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2.803704</td>\n",
+       "      <td>2.776817</td>\n",
+       "      <td>+</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0.807126</td>\n",
+       "      <td>-2.515565</td>\n",
+       "      <td>x</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         x0        x1 label\n",
+       "0 -3.318190  3.427192     o\n",
+       "1  2.803704  2.776817     +\n",
+       "2  0.807126 -2.515565     x"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "clusters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "f210c534",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>x0</th>\n",
+       "      <th>x1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>8.370099</td>\n",
+       "      <td>7.747045</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>-2.701740</td>\n",
+       "      <td>0.395336</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>-3.204128</td>\n",
+       "      <td>-0.407438</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>-3.132762</td>\n",
+       "      <td>-1.335692</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>7.152737</td>\n",
+       "      <td>6.069995</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         x0        x1\n",
+       "0  8.370099  7.747045\n",
+       "1 -2.701740  0.395336\n",
+       "2 -3.204128 -0.407438\n",
+       "3 -3.132762 -1.335692\n",
+       "4  7.152737  6.069995"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "a28466ce",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "class KM:\n",
+    "    \"\"\"Hand-rolled K-Means on 2-D points (x0, x1) with labeled centroids.\"\"\"\n",
+    "    def __init__(self, df, clusters):\n",
+    "        # df: points with x0/x1 columns; clusters: centroids with x0/x1/label columns.\n",
+    "        # Both are copied because assign_points/update_centers modify them in place.\n",
+    "        self.df = df.copy()\n",
+    "        self.clusters = clusters.copy()\n",
+    "        self.labels = clusters[\"label\"].values\n",
+    "        \n",
+    "    def plot(self):\n",
+    "        ax = km_scatter(self.df, color=\"0.7\", s=100)  # data points in grey\n",
+    "        km_scatter(self.clusters, ax=ax, color=\"red\", s=200)  # centroids on the same axes\n",
+    "        \n",
+    "    def assign_points(self):\n",
+    "        \"\"\"\n",
+    "        Label each point in self.df with its nearest centroid (Euclidean distance).\n",
+    "        \"\"\"\n",
+    "        for center in self.clusters.itertuples():\n",
+    "            # Measure from self.df (this object's copy), not the notebook-level df\n",
+    "            x0_diff = self.df[\"x0\"] - center.x0\n",
+    "            x1_diff = self.df[\"x1\"] - center.x1\n",
+    "            distances = (x0_diff ** 2 + x1_diff ** 2) ** 0.5\n",
+    "            # keep the distance to this centroid in a column named after its label\n",
+    "            self.df[center.label] = distances\n",
+    "        # per row, pick the label whose distance column is smallest\n",
+    "        self.df[\"label\"] = self.labels[self.df[self.labels].values.argmin(axis=1)]\n",
+    "    \n",
+    "    def update_centers(self):\n",
+    "        \"\"\"\n",
+    "        Move each centroid to the mean of the points currently labeled with it.\n",
+    "        A centroid with no assigned points keeps its previous location.\n",
+    "        \"\"\"\n",
+    "        for center in self.clusters.itertuples():\n",
+    "            subset_df = self.df[self.df[\"label\"] == center.label]\n",
+    "            # skip empty clusters so the mean of zero rows (NaN) is never stored\n",
+    "            if len(subset_df) > 0:\n",
+    "                self.clusters.at[center.Index, \"x0\"] = subset_df[\"x0\"].mean()\n",
+    "                self.clusters.at[center.Index, \"x1\"] = subset_df[\"x1\"].mean()\n",
+    "\n",
+    "\"\"\"\n",
+    "High-level algorithm:\n",
+    "1. Start with random locations for centroids\n",
+    "2. Iterate over each data point:\n",
+    "    1. Find the distance (Euclidean distance) between current data point and each centroid.\n",
+    "    2. Find the minimum of those distances and the corresponding label.\n",
+    "    3. Assign current data point to the closest cluster centroid label.\n",
+    "3. Once all points are assigned, compute a new centroid for each cluster. Iterate over \n",
+    "   each cluster:\n",
+    "    1. Extract the subset of data points assigned to the current cluster label.\n",
+    "    2. Compute the mean of all the assigned data points.\n",
+    "    3. Update the cluster centroid.\n",
+    "4. Repeat steps 2 and 3 many times (iterative improvement).\n",
+    "\"\"\"\n",
+    "\n",
+    "# Create the model from the data points and the random initial centroids\n",
+    "km = KM(df, clusters)\n",
+    "km.plot()\n",
+    "# Uncomment to step through a single iteration by hand:\n",
+    "# km.assign_points()\n",
+    "# km.plot()\n",
+    "# km.update_centers()\n",
+    "# Run 10 assign/update rounds (fixed count; no convergence check)\n",
+    "for i in range(10):\n",
+    "    km.assign_points()\n",
+    "    km.update_centers()\n",
+    "    \n",
+    "km.plot()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "938a6cc5",
+   "metadata": {},
+   "source": [
+    "### `sklearn KMeans`\n",
+    "\n",
+    "- import statement:\n",
+    "```python\n",
+    "from sklearn.cluster import KMeans\n",
+    "```\n",
+    "- documentation: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html\n",
+    "\n",
+    "**Instantiation:**\n",
+    "`KMeans(n_clusters=<num>, n_init=<num>, max_iter=<num>)`\n",
+    "- `n_clusters`: number of clusters to be formed\n",
+    "- `n_init`: number of initial random seeds to try (to avoid downside of bad initial random choices)\n",
+    "- `max_iter`: maximum number of iterations for a single K-means run (single starting seed)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "caa96a1e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style>#sk-container-id-1 {color: black;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"â–¸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"â–¾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>KMeans(n_clusters=3, n_init=320)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">KMeans</label><div class=\"sk-toggleable__content\"><pre>KMeans(n_clusters=3, n_init=320)</pre></div></div></div></div></div>"
+      ],
+      "text/plain": [
+       "KMeans(n_clusters=3, n_init=320)"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "km_cluster = KMeans(n_clusters=3, n_init=320)  # 3 clusters, 320 random restarts\n",
+    "km_cluster"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "ea51243c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>x0</th>\n",
+       "      <th>x1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>8.370099</td>\n",
+       "      <td>7.747045</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>-2.701740</td>\n",
+       "      <td>0.395336</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>-3.204128</td>\n",
+       "      <td>-0.407438</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>-3.132762</td>\n",
+       "      <td>-1.335692</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>7.152737</td>\n",
+       "      <td>6.069995</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         x0        x1\n",
+       "0  8.370099  7.747045\n",
+       "1 -2.701740  0.395336\n",
+       "2 -3.204128 -0.407438\n",
+       "3 -3.132762 -1.335692\n",
+       "4  7.152737  6.069995"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "84e59c4a",
+   "metadata": {},
+   "source": [
+    "**Methods:**\n",
+    "1. `fit`: find good centroids\n",
+    "2. `transform`: give me the distances from each point to each centroid\n",
+    "3. `predict`: give me the chosen group labels\n",
+    "\n",
+    "**Attributes:**\n",
+    "- `<km object>.cluster_centers_`: coordinates of cluster centers\n",
+    "- `<km object>.inertia_`: sum of squared distances of samples to their closest cluster center"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "26be1744",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[ 0.85985598,  3.98556415],\n",
+       "       [ 7.69751168,  7.9241129 ],\n",
+       "       [-4.41347291,  0.43410278]])"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# `fit`: find good centroids\n",
+    "km_cluster.fit(df)\n",
+    "# coordinates of cluster centers\n",
+    "km_cluster.cluster_centers_"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6ce05e61",
+   "metadata": {},
+   "source": [
+    "**Observation:** 3 rows (because we have 3 clusters), and 2 columns (because the df had 2 columns)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "2df977a4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[ 8.39955315,  0.69550479, 14.72748598],\n",
+       "       [ 5.057144  , 12.83849367,  1.71217188],\n",
+       "       [ 5.984516  , 13.72080475,  1.47333179],\n",
+       "       [ 6.65257594, 14.24916908,  2.18458064],\n",
+       "       [ 6.62911793,  1.93249405, 12.86625407],\n",
+       "       [ 6.42867089, 14.14472291,  1.56004975],\n",
+       "       [ 7.00663718, 14.79157898,  1.29983259],\n",
+       "       [ 7.09747529,  1.56332001, 13.35025948],\n",
+       "       [ 6.1620119 , 13.93648735,  1.20010768],\n",
+       "       [ 1.75079801,  9.23940355,  5.12923462],\n",
+       "       [ 7.45185474, 15.29849451,  1.23053441],\n",
+       "       [ 8.98045036,  1.52459286, 15.3322184 ],\n",
+       "       [10.91386222,  3.37380113, 17.16390523],\n",
+       "       [ 8.05146158,  0.29966036, 14.40658807],\n",
+       "       [ 1.50499677,  6.61820665,  7.66113117],\n",
+       "       [ 5.52922108, 13.2665394 ,  2.37571603],\n",
+       "       [ 8.29635389,  0.53707918, 14.65282383],\n",
+       "       [ 7.93015965,  1.55598141, 14.25952349],\n",
+       "       [ 5.48003982, 13.29440299,  1.90656095],\n",
+       "       [ 7.2343239 , 15.0936722 ,  0.96395695],\n",
+       "       [ 1.41855649,  9.25115661,  4.992096  ],\n",
+       "       [ 1.45498683,  9.33747036,  4.93551825],\n",
+       "       [ 5.44776095, 13.23976135,  2.07379647],\n",
+       "       [ 2.19458147,  9.56765164,  4.91887166],\n",
+       "       [ 6.93266631, 14.81316619,  0.57581927],\n",
+       "       [ 7.17892147,  1.11200436, 13.53348077],\n",
+       "       [ 7.67472596,  2.34433677, 13.93980529],\n",
+       "       [ 1.93352254,  9.20846006,  5.26299436],\n",
+       "       [ 6.70061053, 14.59106753,  0.65237687],\n",
+       "       [ 7.12823278,  1.94521438, 13.32557282],\n",
+       "       [ 0.74526095,  8.30288243,  6.05203173],\n",
+       "       [ 0.40860041,  8.00815579,  6.23474579],\n",
+       "       [ 9.49044337,  1.60009447, 15.83753761],\n",
+       "       [ 6.22489679, 14.10790541,  0.87321819],\n",
+       "       [ 8.435192  ,  1.15378784, 14.72414628],\n",
+       "       [10.2531705 ,  2.38736888, 16.58783267],\n",
+       "       [ 2.27154202,  5.83830901,  8.55105873],\n",
+       "       [ 2.75660235,  5.95202527,  8.78629101],\n",
+       "       [ 2.84287347, 10.1425873 ,  4.54066469],\n",
+       "       [ 7.16356495,  0.93262286, 13.5212101 ],\n",
+       "       [ 9.19576267,  1.30641878, 15.54221673],\n",
+       "       [ 5.34666581,  2.54971515, 11.7015581 ],\n",
+       "       [ 2.54587855, 10.22941308,  4.36721141],\n",
+       "       [ 1.88491159,  8.68824585,  6.14751601],\n",
+       "       [ 7.33668275, 15.17700603,  1.1667762 ],\n",
+       "       [ 5.45233018, 13.30733704,  1.01799729],\n",
+       "       [ 1.7579973 ,  7.49072887,  7.03318297],\n",
+       "       [ 9.86402785, 17.75488558,  3.54826072],\n",
+       "       [ 7.2031497 ,  1.20887179, 13.55443098],\n",
+       "       [ 5.87271591, 13.74675135,  0.51824276],\n",
+       "       [ 2.30838073,  7.10823668,  7.82838283],\n",
+       "       [ 7.80457898,  1.12270374, 14.15307497],\n",
+       "       [ 6.4156169 , 14.24917525,  1.59442989],\n",
+       "       [ 6.6822816 ,  1.3318725 , 13.03994358],\n",
+       "       [ 6.92982586, 14.80507705,  0.59024245],\n",
+       "       [ 8.63436577,  1.42658402, 14.90539467],\n",
+       "       [ 5.41291689, 13.29593667,  1.25147313],\n",
+       "       [ 8.54636085,  1.19827726, 14.89915191],\n",
+       "       [ 0.57780829,  8.14567593,  6.10488957],\n",
+       "       [ 0.77282299,  8.60942142,  5.67744983],\n",
+       "       [ 2.59528033,  6.32091816,  8.49307124],\n",
+       "       [ 5.35252808, 13.23709537,  1.00610963],\n",
+       "       [ 2.26369124,  7.07230771,  7.5990801 ],\n",
+       "       [ 1.26785392,  9.0405774 ,  5.30885047],\n",
+       "       [ 5.99596267, 13.88620068,  0.47948765],\n",
+       "       [ 0.91763133,  8.58111489,  5.77604558],\n",
+       "       [ 6.65464378,  2.06300507, 12.86777741],\n",
+       "       [ 6.39246794, 14.09638583,  1.62566554],\n",
+       "       [ 1.65291205,  8.35369533,  6.38383554],\n",
+       "       [ 7.53920353, 15.42660006,  1.41490518],\n",
+       "       [ 6.17924199, 14.06223865,  0.17931956],\n",
+       "       [ 8.31324068,  0.59836973, 14.67041305],\n",
+       "       [ 1.99362004,  6.00685668,  8.25397394],\n",
+       "       [ 8.34952026, 16.1079315 ,  2.4063456 ],\n",
+       "       [ 0.37645384,  7.86635161,  6.37596316],\n",
+       "       [ 1.26889021,  9.1588198 ,  5.08950884],\n",
+       "       [ 6.67830608, 14.56800175,  0.69981327],\n",
+       "       [ 0.77020788,  8.64186223,  5.59931912],\n",
+       "       [ 7.79244516,  0.28968299, 14.14799736],\n",
+       "       [ 5.12907373, 12.98613124,  1.30572045],\n",
+       "       [ 2.53789955,  5.87809732,  8.68870984],\n",
+       "       [ 2.36585725,  8.24866952,  6.61574395],\n",
+       "       [ 2.12612074,  5.87257865,  8.45067852],\n",
+       "       [ 0.90652387,  7.12826473,  7.11997028],\n",
+       "       [ 7.26336958,  0.71023041, 13.61995075],\n",
+       "       [ 1.15089783,  8.22238495,  6.12509618],\n",
+       "       [ 7.02568051, 14.91475526,  0.95000417],\n",
+       "       [ 6.83520257,  1.30571326, 13.19097587],\n",
+       "       [ 2.02969683,  9.13122664,  5.6933789 ],\n",
+       "       [ 7.27004865,  0.73792958, 13.60223177],\n",
+       "       [ 1.82317591,  6.1130832 ,  8.1696311 ],\n",
+       "       [ 7.42721394,  0.4748222 , 13.7794429 ],\n",
+       "       [10.81513449,  2.92436629, 17.16231237],\n",
+       "       [ 7.03701825, 14.8381987 ,  2.0476452 ],\n",
+       "       [ 7.26300626,  0.79864959, 13.62070697],\n",
+       "       [ 6.25392175, 14.14191977,  0.70491739],\n",
+       "       [ 5.50220436, 13.35168297,  1.00665632],\n",
+       "       [ 0.30480731,  8.16896184,  6.07336225],\n",
+       "       [ 7.63813717,  0.63263536, 13.95950443],\n",
+       "       [ 8.10973138,  1.16783284, 14.38622797]])"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# `transform`: give me the distances from each point to each centroid\n",
+    "km_cluster.transform(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7cd8409e",
+   "metadata": {},
+   "source": [
+    "**Observations**: Each row corresponds to a row in df. 3 columns correspond to 3 distances to the centroids."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "6a65a976",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([1, 2, 2, 2, 1, 2, 2, 1, 2, 0, 2, 1, 1, 1, 0, 2, 1, 1, 2, 2, 0, 0,\n",
+       "       2, 0, 2, 1, 1, 0, 2, 1, 0, 0, 1, 2, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,\n",
+       "       2, 2, 0, 2, 1, 2, 0, 1, 2, 1, 2, 1, 2, 1, 0, 0, 0, 2, 0, 0, 2, 0,\n",
+       "       1, 2, 0, 2, 2, 1, 0, 2, 0, 0, 2, 0, 1, 2, 0, 0, 0, 0, 1, 0, 2, 1,\n",
+       "       0, 1, 0, 1, 1, 2, 1, 2, 2, 0, 1, 1], dtype=int32)"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# `predict`: give me the chosen group labels\n",
+    "km_cluster.predict(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "240d995a",
+   "metadata": {},
+   "source": [
+    "### How many clusters do we need?\n",
+    "\n",
+    "- metric: `<km object>.inertia_`: sum of squared distances of samples to their closest cluster center"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "8bf73d2c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "260.7196850565891"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "km_cluster.inertia_"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "57b5ccc4",
+   "metadata": {},
+   "source": [
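+    "**Observation**: for a fixed number of clusters, we want \"inertia\" to be as small as possible. Inertia keeps decreasing as `n_clusters` grows, though, so we cannot choose `n_clusters` simply by minimizing it."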
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ae70b416",
+   "metadata": {},
+   "source": [
+    "### Elbow plot to determine `n_clusters`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "607a96b0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1     3621.095890\n",
+       "2      927.007644\n",
+       "3      260.719685\n",
+       "4      211.730031\n",
+       "5      180.483456\n",
+       "6      152.850252\n",
+       "7      128.980812\n",
+       "8      109.222545\n",
+       "9       92.946695\n",
+       "10      82.172702\n",
+       "dtype: float64"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# create a Series indexed by the number of clusters (1 to 10) whose values are the corresponding inertia\n",
+    "s = pd.Series(dtype=float)\n",
+    "\n",
+    "for num_clusters in range(1, 11):\n",
+    "    km = KMeans(num_clusters, n_init = 320)\n",
+    "    km.fit(df)\n",
+    "    s.at[num_clusters] = km.inertia_\n",
+    "\n",
+    "s"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "388cd23f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Text(0.5, 0, 'Number of clusters')"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 600x400 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "ax = s.plot.line(figsize=(6, 4))\n",
+    "ax.set_ylabel(\"Inertia\")\n",
+    "ax.set_xlabel(\"Number of clusters\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eab497cd",
+   "metadata": {},
+   "source": [
+    "**Observation**: there is an \"elbow\" around `n_clusters`=3."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8e763d1c",
+   "metadata": {},
+   "source": [
+    "#### Will we always have a clear \"elbow\"?\n",
+    "\n",
+    "- Let's generate uniform random data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "b5ad30ec",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Text(0.5, 0, 'Number of clusters')"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAiUAAAFzCAYAAADhUnmcAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/xnp5ZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABL50lEQVR4nO3deVwTd/4/8FdCSDgTDiUhckjrBYqIZ1Frd1d+4tm6tYeWVWutblu19airtvXoJWpP7dfq2u1Wd+vR7las2tVKvaiKiCgeqHgUAcWAciQcciXz+wNJTcGKCE6O1/PxmIdm5pPJe7K75rUz7/mMRBAEAUREREQik4pdABERERHAUEJERERWgqGEiIiIrAJDCREREVkFhhIiIiKyCgwlREREZBUYSoiIiMgqMJQQERGRVZCJXYCtMJlMyM3NhaenJyQSidjlEBER2QxBEFBSUgKtVgup9M7nQxhKGik3NxeBgYFil0FERGSzcnJyEBAQcMftDCWN5OnpCaD2C1UqlSJXQ0REZDsMBgMCAwPNv6V3ImooSUxMxAcffIDU1FRcu3YN8fHxGDlypMWYs2fPYs6cOdi/fz9qamoQFhaG7777DkFBQQCAiooKzJo1C5s2bUJlZSViYmLw+eefQ61Wm/eRnZ2Nl19+GXv37oWHhwfGjx+PuLg4yGSNP/y6SzZKpZKhhIiIqAnu1v4gaqNrWVkZIiIisHLlyga3X7p0Cf3790enTp2wb98+nDx5EvPnz4eLi4t5zIwZM7Bt2zb85z//wf79+5Gbm4snn3zSvN1oNGLYsGGoqqrCoUOHsG7dOqxduxYLFixo8eMjIiKixpNYy1OCJRJJvTMlo0ePhrOzM/797383+B69Xo/WrVtjw4YNeOqppwAA586dQ2hoKJKSkvDII49gx44dGD58OHJzc81nT1avXo05c+bg+vXrkMvljarPYDBApVJBr9fzTAkREdE9aOxvqNXeEmwymfDDDz+gQ4cOiImJgZ+fH/r06YMtW7aYx6SmpqK6uhrR0dHmdZ06dUJQUBCSkpIAAElJSQgPD7e4nBMTEwODwYD09PQ7fn5lZSUMBoPFQkRERC3HakNJfn4+SktLsWTJEgwePBi7du3Cn//8Zzz55JPYv38/AECn00Eul8PLy8vivWq1Gjqdzjzm9kBSt71u253ExcVBpVKZF955Q0RE1LKsNpSYTCYAwBNPPIEZM2agW7dumDt3LoYPH47Vq1e3+OfPmzcPer3evOTk5LT4ZxIRETkyqw0lrVq1gkwmQ1hYmMX60NBQZGdnAwA0Gg2qqqpQXFxsMSYvLw8ajcY8Ji8vr972um13olAozHfa8I4bIiKilme1oUQul6NXr17IyMiwWH/+/HkEBwcDAHr06AFnZ2fs3r3bvD0jIwPZ2dmIiooCAERFReHUqVPIz883j0lISIBSqawXeIiIiEg8os5TUlpaiosXL5pfZ2ZmIi0tDT4+PggKCsLs2bPx7LPPYsCAAfjjH/+InTt3Ytu2bdi3bx8AQKVSYeLEiZg5cyZ8fHygVCoxbdo0REVF4ZFHHgEADBo0CGFhYRg7diyWLVsGnU6Ht956C1OmTIFCoRDjsImIiKghgoj27t0rAKi3jB8/3jzmyy+/FNq1aye4uLgIERERwpYtWyz2cfPmTeGVV14RvL29BTc3N+HPf/6zcO3aNYsxly9fFoYMGSK4uroKrVq1EmbNmiVUV1ffU616vV4AIOj1+iYfLxERkSNq7G+o1cxTYu2ae56SqhoTdp3RwV/lih7B3s1QIRERkXWy+XlK7N3y3ecxdcNxrNp38e6DiYiIHABDiUie7F77lMQ95/Kh01eIXA0REZH4GEpE8nBrD/QJ8YFJAL49yjlQiIiIGEpENKZ37ZOOv0nJgdHE1h4iInJsDCUiGtxFA5WrM64W30Tihetil0NERCQqhhIRuTg7YdSt3pKNydkiV0NERCQuhhKRjel
d+6C/3efykW9gwysRETkuhhKRtVd7oldbbxhNAv6TekXscoiIiETDUGIFRveqbXjdeCQbJja8EhGRg2IosQLDuvpD6SLDlaKbOHDxhtjlEBERiYKhxAq4ODuZJ1PbeIQNr0RE5JgYSqzE6FsNrwln8nC9pFLkaoiIiB48hhIr0UmjRPcgL9SYBPyXDa9EROSAGEqsyOhbM7xuSmHDKxEROR6GEisyvKs/PBUyZBWUI+mXArHLISIieqAYSqyIm1yGkZFtAAAb2PBKREQOhqHEytQ9pG9Xug4FpWx4JSIix8FQYmXCtEpEBHqh2ijgu2NseCUiIsfBUGKFxvSqvT1445EcCAIbXomIyDEwlFihERFauMudkHmjDId/KRS7HCIiogeCocQKuStkeOJWwytneCUiIkfBUGKlnrvV8LrztA5FZVUiV0NERNTyGEqsVJc2KoS3UaHKaGLDKxEROQSGEitW9zycjUey2fBKRER2j6HEij0eoYWb3AmXrpch5XKR2OUQERG1KIYSK+bp4ozHI7QA2PBKRET2j6HEytXN8PrDqWsoLmfDKxER2S+GEivXNUCFMH8lqmpMiD9+VexyiIiIWoyooSQxMREjRoyAVquFRCLBli1b7jj2pZdegkQiwaeffmqxvrCwELGxsVAqlfDy8sLEiRNRWlpqMebkyZN49NFH4eLigsDAQCxbtqwFjqZlSCQSjGHDKxEROQBRQ0lZWRkiIiKwcuXK3x0XHx+Pw4cPQ6vV1tsWGxuL9PR0JCQkYPv27UhMTMTkyZPN2w0GAwYNGoTg4GCkpqbigw8+wKJFi7BmzZpmP56W8kRkG7g4S3E+rxTHstnwSkRE9kkm5ocPGTIEQ4YM+d0xV69exbRp0/Djjz9i2LBhFtvOnj2LnTt3IiUlBT179gQAfPbZZxg6dCg+/PBDaLVarF+/HlVVVfjnP/8JuVyOzp07Iy0tDR9//LFFeLFmShdnjOiqxX9Sr2BDcg56BPuIXRIREVGzs+qeEpPJhLFjx2L27Nno3Llzve1JSUnw8vIyBxIAiI6OhlQqRXJysnnMgAEDIJfLzWNiYmKQkZGBoqI7n3WorKyEwWCwWMQ0pk9dw2su9DerRa2FiIioJVh1KFm6dClkMhleffXVBrfrdDr4+flZrJPJZPDx8YFOpzOPUavVFmPqXteNaUhcXBxUKpV5CQwMvJ9DuW+RgV7opPFERbUJ36ex4ZWIiOyP1YaS1NRULF++HGvXroVEInngnz9v3jzo9XrzkpOT88BruJ1EIsHoXrXBaEMyG16JiMj+WG0o+fnnn5Gfn4+goCDIZDLIZDJkZWVh1qxZaNu2LQBAo9EgPz/f4n01NTUoLCyERqMxj8nLy7MYU/e6bkxDFAoFlEqlxSK2P0cGQCGT4pyuBGk5xWKXQ0RE1KysNpSMHTsWJ0+eRFpamnnRarWYPXs2fvzxRwBAVFQUiouLkZqaan7fnj17YDKZ0KdPH/OYxMREVFf/2oeRkJCAjh07wtvb+8Ee1H1SuTljWFd/AJzhlYiI7I+ooaS0tNQcOAAgMzMTaWlpyM7Ohq+vL7p06WKxODs7Q6PRoGPHjgCA0NBQDB48GJMmTcKRI0dw8OBBTJ06FaNHjzbfPvzcc89BLpdj4sSJSE9PxzfffIPly5dj5syZYh32fXnu1gyv205cQ0kFG16JiMh+iBpKjh49isjISERGRgIAZs6cicjISCxYsKDR+1i/fj06deqEgQMHYujQoejfv7/FHCQqlQq7du1CZmYmevTogVmzZmHBggU2czvwb/UI9kZ7Pw/crDbi+7RcscshIiJqNhKBHZONYjAYoFKpoNfrRe8v+fJAJt7dfgZh/kr88Gp/URqBiYiIGquxv6FW21NCd/ZkZBvIZVKcuWbAqat6scshIiJqFgwlNsjbXY6hXWrvHGLDKxER2QuGEhs15lbD6/dpuSitrBG5GiIiovvHUGKjeof44KHW7iivMmLbCTa8EhG
R7WMosVESiQRjetWeLeElHCIisgcMJTZsVI8AyJ2kOHlFj9NseCUiIhvHUGLDfNzliGHDKxER2QmGEhs3pnftQ/q+T8tFGRteiYjIhjGU2Lioh3zR1tcNpZU1+OHkNbHLISIiajKGEhsnkUgw+tbtwRt4CYeIiGwYQ4kdeKpHAJydJEjLKcaZXIPY5RARETUJQ4kdaOWhwKCw2obXTSk8W0JERLaJocRO1M3wGn/8Km5WGUWuhoiI6N4xlNiJvg/7IsjHDSUVNfjhFBteiYjI9jCU2AmpVIJne9XeHsw5S4iIyBYxlNiRp3sGQCaVIDWrCBm6ErHLISIiuicMJXbEz9MF0aFqADxbQkREtoehxM6M6VPb8Lr52BVUVLPhlYiIbAdDiZ15tF0rtPFyhaGiBv9jwysREdkQhhI7I5VKMPpWw+umIzkiV0NERNR4DCV26OmegXCSSnDkciEu5rPhlYiIbANDiR3SqFzwp05+AICNPFtCREQ2gqHETj13a4bX79jwSkRENoKhxE4N6NAaWpULisur8WO6TuxyiIiI7oqhxE45SSV4hjO8EhGRDWEosWPP9AyEVAIc/qUQv1wvFbscIiKi38VQYse0Xq74Y8fahtdNKWx4JSIi6yZqKElMTMSIESOg1WohkUiwZcsW87bq6mrMmTMH4eHhcHd3h1arxbhx45Cbm2uxj8LCQsTGxkKpVMLLywsTJ05EaanlWYGTJ0/i0UcfhYuLCwIDA7Fs2bIHcXhWYcythtf/pl5BZQ0bXomIyHqJGkrKysoQERGBlStX1ttWXl6OY8eOYf78+Th27Bg2b96MjIwMPP744xbjYmNjkZ6ejoSEBGzfvh2JiYmYPHmyebvBYMCgQYMQHByM1NRUfPDBB1i0aBHWrFnT4sdnDf7QsTXUSgUKy6qwKz1P7HKIiIjuSCIIgiB2EQAgkUgQHx+PkSNH3nFMSkoKevfujaysLAQFBeHs2bMICwtDSkoKevbsCQDYuXMnhg4diitXrkCr1WLVqlV48803odPpIJfLAQBz587Fli1bcO7cuUbXZzAYoFKpoNfroVQq7+tYH7SPd2VgxZ6L6NfOF+tffETscoiIyME09jfUpnpK9Ho9JBIJvLy8AABJSUnw8vIyBxIAiI6OhlQqRXJysnnMgAEDzIEEAGJiYpCRkYGioqIHWr9YnukVCIkEOHixAJdvlIldDhERUYNsJpRUVFRgzpw5GDNmjDll6XQ6+Pn5WYyTyWTw8fGBTqczj1Gr1RZj6l7XjWlIZWUlDAaDxWKrArzd8FiH1gDY8EpERNbLJkJJdXU1nnnmGQiCgFWrVj2Qz4yLi4NKpTIvgYGBD+RzW8qvDa85qKoxiVwNERFRfVYfSuoCSVZWFhISEiyuRWk0GuTn51uMr6mpQWFhITQajXlMXp5lg2fd67oxDZk3bx70er15ycmx7TMMf+rkh9aeCtworcJPZ9nwSkRE1seqQ0ldILlw4QJ++ukn+Pr6WmyPiopCcXExUlNTzev27NkDk8mEPn36mMckJiaiurraPCYhIQEdO3aEt7f3HT9boVBAqVRaLLbM2UmKZ3oGAOAMr0REZJ1EDSWlpaVIS0tDWloaACAzMxNpaWnIzs5GdXU1nnrqKRw9ehTr16+H0WiETqeDTqdDVVUVACA0NBSDBw/GpEmTcOTIERw8eBBTp07F6NGjodVqAQDPPfcc5HI5Jk6ciPT0dHzzzTdYvnw5Zs6cKdZhi2Z0r9pLOD9fuIGcwnKRqyEiIvoNQUR79+4VANRbxo8fL2RmZja4DYCwd+9e8z4KCgqEMWPGCB4eHoJSqRQmTJgglJSUWHzOiRMnhP79+wsKhUJo06aNsGTJknuuVa/XCwAEvV5/v4ctqr/847AQPGe7sGznWbFLISIiB9HY31CrmafE2tnyPCW323HqGl5efwytPRU4NPdPcHay6it4RERkB+xynhK6fwND1WjlIcf1kkrsPpt/9zcQERE9IAw
lDkYuk+KpHrW3N29KYcMrERFZD4YSBzS6V20o2X/+Oq4UseGViIisA0OJA2rbyh392vlCEIBvOcMrERFZCYYSB1U3w+s3R3NQY+QMr0REJD6GEgf1/8LU8HGXI89Qib0Z18Uuh4iIiKHEUSlkTniqR+0Mr5s4wysREVkBhhIHVtfwujcjH7nFN0WuhoiIHB1DiQN7qLUHHnnIByYB+PYoG16JiEhcDCUOztzwmpIDo4mT+xIRkXgYShxcTGcNvNyccU1fgf3nOcMrERGJh6HEwbk4O2FU99qG1w3JvIRDRETiYSghjOn9a8OrTl8hcjVEROSoGEoI7fw80butD4wmAf9hwysREYmEoYQAAGP61D2kjw2vREQkDoYSAgAM6eIPpYsMV4tv4ucLnOGViIgePIYSAlDb8PrkrYbXjZzhlYiIRMBQQmZ1c5bsPpuPfAMbXomI6MFiKCGzjhpP9Aj2Ro1JwH9Sr4hdDhERORiGErJQd7ZkU0o2TGx4JSKiB4ihhCwMC/eHp4sMOYU3cfDSDbHLISIiB8JQQhZc5U74c2QbAGx4JSKiB4uhhOoZ3av2Es6u9DxcL6kUuRoiInIUDCVUT5hWiW6BXqgxCfjuGBteiYjowWAooQY9V9fweoQNr0RE9GAwlFCDhkf4w0Mhw+WCchz+pUDscoiIyAEwlFCD3OQyPNFNCwDYwIZXIiJ6ABhK6I7q5izZlZ6HglI2vBIRUcsSNZQkJiZixIgR0Gq1kEgk2LJli8V2QRCwYMEC+Pv7w9XVFdHR0bhw4YLFmMLCQsTGxkKpVMLLywsTJ05EaWmpxZiTJ0/i0UcfhYuLCwIDA7Fs2bKWPjS70KWNCl0DVKgymrD52FWxyyEiIjsnaigpKytDREQEVq5c2eD2ZcuWYcWKFVi9ejWSk5Ph7u6OmJgYVFT8+lyW2NhYpKenIyEhAdu3b0diYiImT55s3m4wGDBo0CAEBwcjNTUVH3zwARYtWoQ1a9a0+PHZg7qzJRuPZEMQ2PBKREQtSLASAIT4+Hjza5PJJGg0GuGDDz4wrysuLhYUCoWwceNGQRAE4cyZMwIAISUlxTxmx44dgkQiEa5evSoIgiB8/vnngre3t1BZWWkeM2fOHKFjx473VJ9erxcACHq9vimHZ7NKKqqF0Pk7hOA524WkSzfELoeIiGxQY39DrbanJDMzEzqdDtHR0eZ1KpUKffr0QVJSEgAgKSkJXl5e6Nmzp3lMdHQ0pFIpkpOTzWMGDBgAuVxuHhMTE4OMjAwUFRXd8fMrKythMBgsFkfkofi14ZUzvBIRUUuy2lCi0+kAAGq12mK9Wq02b9PpdPDz87PYLpPJ4OPjYzGmoX3c/hkNiYuLg0qlMi+BgYH3d0A2rO4Szo7TOhSVVYlcDRER2SurDSVimzdvHvR6vXnJyckRuyTRhLdRobNWiaoaEzYfZ8MrERG1DKsNJRqNBgCQl5dnsT4vL8+8TaPRID8/32J7TU0NCgsLLcY0tI/bP6MhCoUCSqXSYnFUEomEDa9ERNTirDaUhISEQKPRYPfu3eZ1BoMBycnJiIqKAgBERUWhuLgYqamp5jF79uyByWRCnz59zGMSExNRXV1tHpOQkICOHTvC29v7AR2N7Xuimxauzk64mF+Ko1l37sUhIiJqKlFDSWlpKdLS0pCWlgagtrk1LS0N2dnZkEgkmD59Ot577z1s3boVp06dwrhx46DVajFy5EgAQGhoKAYPHoxJkybhyJEjOHjwIKZOnYrRo0dDq61tznzuuecgl8sxceJEpKen45tvvsHy5csxc+ZMkY7aNnm6OGNEhD8AYGMyG16JiKgFPJibgRq2d+9eAUC9Zfz48YIg1N4WPH/+fEGtVgsKhUIYOHCgkJGRYbGPgoICYcyYMYKHh4egVCqFCRMmCCUlJRZjTpw4IfTv319QKBRCmzZthCVLltxzrY56S/DtjmUVCsFztgsd3vyfUFxWJXY5RERkIxr7GyoRBDYINIb
BYIBKpYJer3fY/hJBEDBk+c84pyvBohFheL5fiNglERGRDWjsb6jV9pSQ9ZFIJHiuT13Daw4bXomIqFkxlNA9eaJbG7g4S5GRV4Jj2cVil0NERHaEoYTuicrVGcPCa5uI/3kwU+RqiIjInjCU0D17oX9bSCTADyev4fAvBWKXQ0REdoKhhO5ZZ60Kz92aTG3h9+moMZpEroiIiOwBQwk1yeyYjvB2c0ZGXgn+lZQldjlERGQHGEqoSbzc5Jgd0wkA8EnCeVwvqRS5IiIisnWypr7x6NGj+Pbbb5GdnY2qKssnx27evPm+CyPr92yvQGxKycbJK3os2XEOHz0TIXZJRERkw5p0pmTTpk3o27cvzp49i/j4eFRXVyM9PR179uyBSqVq7hrJSjlJJXj78c4AgO+OXUFqVqHIFRERkS1rUihZvHgxPvnkE2zbtg1yuRzLly/HuXPn8MwzzyAoKKi5ayQrFhnkjWd7BgIAFnyfDqOJE6oREVHTNCmUXLp0CcOGDQMAyOVylJWVQSKRYMaMGVizZk2zFkjW72+DO0LpIkN6rgEbjvBhfURE1DRNCiXe3t4oKSkBALRp0wanT58GABQXF6O8vLz5qiOb4OuhwOsxHQEAH/6YgcKyqru8g4iIqL4mhZIBAwYgISEBAPD000/jtddew6RJkzBmzBgMHDiwWQsk2/Bc7yCE+iuhv1mND348J3Y5RERkg5r0lODCwkJUVFRAq9XCZDJh2bJlOHToENq3b4+33noL3t7eLVGrqPiU4Ls7erkQT61OgkQCbHmlHyICvcQuiYiIrEBjf0ObFEocEUNJ48z8Jg2bj19FRIAK8a/0g1QqEbskIiISWWN/Qxt9+cZgMFj8/fcWclxzh3aCh0KGE1f0+PZojtjlEBGRDWl0KPH29kZ+fj4AwMvLC97e3vWWuvXkuPw8XTA9uj0AYOnOcyguZ9MrERE1TqNndN2zZw98fHwAAHv37m2xgsj2je/bFt8ezcH5vFJ8tOs83h3ZReySiIjIBjSppyQ7OxuBgYGQSCz7BQRBQE5Ojl1OoMaeknuTdKkAY744DKkE2Dq1P7q04Uy/RESOqtl7Sm4XEhKC69ev11tfWFiIkJCQpuyS7EzUw74YEaGFSQAWfH8aJs70SkREd9GkUCIIQr2zJABQWloKFxeX+y6K7MMbQzvBTe6EY9nF2Hz8qtjlEBGRlbunpwTPnDkTACCRSDB//ny4ubmZtxmNRiQnJ6Nbt27NWiDZLn+VK14d2B5LdpzDkh1nMaizGkoXZ7HLIiIiK3VPoeT48eMAas+UnDp1CnK53LxNLpcjIiICr7/+evNWSDbthX4h+PZoDn65XoZPEs5j4YjOYpdERERWqkmNrhMmTMCKFSvg6enZEjVZJTa6Nl3i+esY988jcJJK8MOr/dFJw++PiMiRtFija3V1Nf79738jKyvrvgokxzGgQ2sM7qyB0SRgwffp4CTCRETUkHsOJc7OzggKCoLRaGyJeshOvTU8FC7OUhzJLMTWE7lil0NERFaoSXffvPnmm3jjjTdQWFjY3PWQnQrwdsOUP7QDACz+31mUVtaIXBEREVmbJvWUREZG4uLFi6iurkZwcDDc3d0tth87dqzZCrQW7Cm5fxXVRsR8moisgnL8dcBDmDc0VOySiIjoAWjsb+g93X1TZ+TIkU2t654YjUYsWrQIX3/9NXQ6HbRaLZ5//nm89dZb5nlSBEHAwoUL8cUXX6C4uBj9+vXDqlWr0L59e/N+CgsLMW3aNGzbtg1SqRSjRo3C8uXL4eHh8UCOg2q5ODth4YgwvLD2KL48kImnewagnZ/jNEsTEdHva1IoWbhwYXPX0aClS5di1apVWLduHTp37oyjR49iwoQJUKlUePXVVwEAy5Ytw4oVK7Bu3TqEhIRg/vz5iImJwZkzZ8wTucXGxuLatWtISEhAdXU1JkyYgMmTJ2PDhg0P5DjoV3/qpEZ0qB9+OpuPRVvP4N8Tezc4ER8
RETmeJl2+AYDi4mL897//xaVLlzB79mz4+Pjg2LFjUKvVaNOmTbMUN3z4cKjVanz55ZfmdaNGjYKrqyu+/vprCIIArVaLWbNmmedH0ev1UKvVWLt2LUaPHo2zZ88iLCwMKSkp6NmzJwBg586dGDp0KK5cuQKtVtuoWnj5pvlkF5Qj+pP9qKox4fPY7hga7i92SURE1IJa9Nk3J0+eRIcOHbB06VJ8+OGHKC4uBgBs3rwZ8+bNa1LBDenbty92796N8+fPAwBOnDiBAwcOYMiQIQCAzMxM6HQ6REdHm9+jUqnQp08fJCUlAQCSkpLg5eVlDiQAEB0dDalUiuTk5Dt+dmVlJQwGg8VCzSPI1w0vPfYwAOC97WdQXsWmVyIiamIomTlzJp5//nlcuHDB4lk3Q4cORWJiYrMVN3fuXIwePRqdOnWCs7MzIiMjMX36dMTGxgIAdDodAECtVlu8T61Wm7fpdDr4+flZbJfJZPDx8TGPaUhcXBxUKpV5CQwMbLbjIuDlxx5GGy9X5OorsHLvRbHLISIiK9CkUJKSkoK//vWv9da3adPmd3/o79W3336L9evXY8OGDTh27BjWrVuHDz/8EOvWrWu2z7iTefPmQa/Xm5ecnJwW/0xH4ip3woIRYQCALxIzkXmjTOSKiIhIbE0KJQqFosHLGefPn0fr1q3vu6g6s2fPNp8tCQ8Px9ixYzFjxgzExcUBADQaDQAgLy/P4n15eXnmbRqNBvn5+Rbba2pqUFhYaB7TEIVCAaVSabFQ8xoUpsaADq1RZTTh7W2c6ZWIyNE1KZQ8/vjjeOedd1BdXQ2g9qnB2dnZmDNnDkaNGtVsxZWXl0MqtSzRyckJJpMJABASEgKNRoPdu3ebtxsMBiQnJyMqKgoAEBUVheLiYqSmpprH7NmzByaTCX369Gm2WuneSSQSLBoRBmcnCfZlXMdPZ/Pv/iYiIrJbTQolH330EUpLS+Hn54ebN2/iscceQ7t27eDp6Yn333+/2YobMWIE3n//ffzwww+4fPky4uPj8fHHH+PPf/4zgNoftenTp+O9997D1q1bcerUKYwbNw5ardY8l0poaCgGDx6MSZMm4ciRIzh48CCmTp2K0aNHN/rOG2o5D7X2wIuPPgQAeHtbOiqq+fgCIiJH1eRbggHgwIEDOHnyJEpLS9G9e3eLu2CaQ0lJCebPn4/4+Hjk5+dDq9VizJgxWLBgAeRyOYBfJ09bs2YNiouL0b9/f3z++efo0KGDeT+FhYWYOnWqxeRpK1asuKfJ03hLcMspr6rBwI/245q+AtOj22N6dIe7v4mIiGxGY39D7yuUOBKGkpa1/WQupm44DoVMip9mPoZAHzexSyIiombSotPMA8Du3buxe/du5Ofnm3s86vzzn/9s6m7JQQ0L98eGh7Nx6FIB3tl+Bl+M63n3NxERkV1pUk/J22+/jUGDBmH37t24ceMGioqKLBaieyWRSPD2450hk0qQcCYPezPY9EpE5GiadKZk9erVWLt2LcaOHdvc9ZADa6/2xIR+bfHFz5l4e2s6+s7whULmJHZZRET0gDTpTElVVRX69u3b3LUQ4dWB7dHaU4HLBeX4x8+ZYpdDREQPUJNCyYsvvsgn7FKL8HRxxptDQwEAn+25gKvFN0WuiIiIHpQmXb6pqKjAmjVr8NNPP6Fr165wdna22P7xxx83S3HkmJ7opsWG5GwcuVyI9384g89je4hdEhERPQBNCiUnT55Et27dAACnT59uznqIapten+iMYSt+xv9O6XDgwg30b99K7LKIiKiFcZ6SRuI8JQ/eoq3pWHvoMh5u7Y4drw2AXNakq41ERCSyFpmn5Mknn7zrGIlEgu++++5edkvUoBn/rwO2ncjFpetlWHsoE5MHPCx2SURE1ILu6f96qlSquy48i0DNReXqjDlDOgEAlv90AXmGCpErIiKilsTLN43EyzfiMJkEjFp9CMezi/FENy2Wj44UuyQiIrpHjf0N5UV6smp
SqQTvPN4FEgnwfVouDv9SIHZJRETUQhhKyOqFB6jwXO8gAMDC79NRYzTd5R1ERGSLGErIJrw+qCO83JyRkVeCfyVliV0OERG1AIYSsgne7nL8Laa26fWThPO4XlIpckVERNTcGErIZjzbKxDhbVQoqazBkh3nxC6HiIiaGUMJ2QwnqQTvPNEZAPDdsStIzSoUuSIiImpODCVkUyKDvPFMzwAAwILv02E08Y52IiJ7wVBCNmfO4E5QusiQnmvAhiPZYpdDRETNhKGEbI6vhwKzBnUEAHz4YwYKy6pEroiIiJoDQwnZpNg+QQj1V0J/sxof/MimVyIie8BQQjZJ5iQ1N71uSsnBiZxicQsiIqL7xlBCNqtXWx88GdkGggAs+P40TGx6JSKyaQwlZNPmDukED4UMJ67o8e3RHLHLISKi+8BQQjbNT+mC6dHtAQBLd55DcTmbXomIbBVDCdm88X3booPaA0Xl1fho13mxyyEioiZiKCGb5+wkxaLHa5te1ydn4fRVvcgVERFRUzCUkF3o+3ArDO/qD5MALNyazqZXIiIbZPWh5OrVq/jLX/4CX19fuLq6Ijw8HEePHjVvFwQBCxYsgL+/P1xdXREdHY0LFy5Y7KOwsBCxsbFQKpXw8vLCxIkTUVpa+qAPhVrYm8NC4SZ3QmpWEeKPXxW7HCIiukdWHUqKiorQr18/ODs7Y8eOHThz5gw++ugjeHt7m8csW7YMK1aswOrVq5GcnAx3d3fExMSgoqLCPCY2Nhbp6elISEjA9u3bkZiYiMmTJ4txSNSC/FWumPan2qbXuB3nYKioFrkiIiK6FxJBEKz2PPfcuXNx8OBB/Pzzzw1uFwQBWq0Ws2bNwuuvvw4A0Ov1UKvVWLt2LUaPHo2zZ88iLCwMKSkp6NmzJwBg586dGDp0KK5cuQKtVtuoWgwGA1QqFfR6PZRKZfMcIDW7qhoTBn+aiF9ulOGFfiFYMCJM7JKIiBxeY39DrfpMydatW9GzZ088/fTT8PPzQ2RkJL744gvz9szMTOh0OkRHR5vXqVQq9OnTB0lJSQCApKQkeHl5mQMJAERHR0MqlSI5OfmOn11ZWQmDwWCxkPWTy35tel2XdBnndPzPjYjIVlh1KPnll1+watUqtG/fHj/++CNefvllvPrqq1i3bh0AQKfTAQDUarXF+9RqtXmbTqeDn5+fxXaZTAYfHx/zmIbExcVBpVKZl8DAwOY8NGpBAzq0xuDOGhhNAhZ+nw4rPhlIRES3sepQYjKZ0L17dyxevBiRkZGYPHkyJk2ahNWrV7f4Z8+bNw96vd685ORwtlBb8tbwULg4S5GcWYitJ3LFLoeIiBrBqkOJv78/wsIsewJCQ0ORnZ0NANBoNACAvLw8izF5eXnmbRqNBvn5+Rbba2pqUFhYaB7TEIVCAaVSabGQ7QjwdsOUP7QDACz+31mUVtaIXBEREd2NVYeSfv36ISMjw2Ld+fPnERwcDAAICQmBRqPB7t27zdsNBgOSk5MRFRUFAIiKikJxcTFSU1PNY/bs2QOTyYQ+ffo8gKMgsUwa8BCCfd2QZ6jEZ7sv3P0NREQkKqsOJTNmzMDhw4exePFiXLx4ERs2bMCaNWswZcoUAIBEIsH06dPx3nvvYevWrTh16hTGjRsHrVaLkSNHAqg9szJ48GBMmjQJR44cwcGDBzF16lSMHj260XfekG1ycXbCwlt333x5IBMX8zk3DRGRNbPqUNKrVy/Ex8dj48aN6NKlC9599118+umniI2NNY/529/+hmnTpmHy5Mno1asXSktLsXPnTri4uJjHrF+/Hp06dcLAgQMxdOhQ9O/fH2vWrBHjkOgB+1MnNQZ28kONScCirWx6JSKyZlY9T4k14TwltiuroAz/75NEVNWYsCq2O4aE+4tdEhGRQ7GLeUqImkOwrzteGvAQAODd7WdQXsWmVyIia8RQQg7h5T+0QxsvV+TqK/D53ktil0NERA1gKCGH4Cp3wvzhtU2vaxJ
/QeaNMpErIiKi32IoIYcR01mNAR1ao8powrh/JnMKeiIiK8NQQg5DIpHg/ZFdEOjjipzCm3jy80PYceqa2GUREdEtDCXkUAJ93LB1Sn/0a+eL8iojXl5/DB/tyoDJxJvQiIjExlBCDsfbXY51E3rjxf4hAIDP9lzE5H8fRUlFtciVERE5NoYSckgyJyneGh6Gj5+JgFwmxU9n8zFy5UH8cp2zvhIRiYWhhBzak90D8N+XouCvcsGl62V4YuVB7D2Xf/c3EhFRs2MoIYfXNcAL30/th57B3iipqMEL61Lw+b6LnJKeiOgBYyghAuDn6YINkx7BmN5BEARg2c4MTNt4nLO/EhE9QAwlRLfIZVLEPRmO90Z2gUwqwfaT1zBqVRJyCsvFLo2IyCEwlBD9xl8eCcaGSY+glYccZ68Z8MTKg0i6VCB2WUREdo+hhKgBvUN8sHVqf3Rpo0RhWRX+8mUy1h7MZJ8JEVELYighugOtlyv++1JfjOymhdEkYNG2M5jz3UlU1hjFLo2IyC4xlBD9DhdnJ3zybDe8OTQUUgnw7dErePbvh5FnqBC7NCIiu8NQQnQXEokEkwY8hLUTekPl6oy0nGKM+OwAjmUXiV0aEZFdYSghaqQBHVpj69R+6KD2QH5JJUb//TC+TckRuywiIrvBUEJ0D4J93bH5lX6I6axGldGEv313Egu/P41qo0ns0oiIbB5DCdE98lDIsCq2B2ZEdwAArEvKwtgvk1FQWilyZUREto2hhKgJpFIJXotujzVje8Bd7oTDvxTi8f87iPRcvdilERHZLIYSovswqLMG8VP6oa2vG64W38SoVYew7USu2GUREdkkhhKi+9RB7Ynvp/THgA6tUVFtwrSNx7F05zkYTZxojYjoXjCUEDUDlZszvnq+F/464CEAwKp9lzBxXQr0N6tFroyIyHYwlBA1EyepBPOGhmL56G5QyKTYl3EdI1cexMX8ErFLIyKyCQwlRM3siW5t8N3LfaFVuSDzRhlGrjyEn87kiV0WEZHVYyghagFd2qiwdVp/9A7xQWllDSb9+yg+230BJvaZEBHdEUMJUQtp5aHA+hf7YOwjwRAE4KOE85iy4RjKKmvELo2IyCrZVChZsmQJJBIJpk+fbl5XUVGBKVOmwNfXFx4eHhg1ahTy8ixPlWdnZ2PYsGFwc3ODn58fZs+ejZoa/jBQy3N2kuLdkV2w5MlwODtJsOO0DqNWHUJ2QbnYpRERWR2bCSUpKSn4+9//jq5du1qsnzFjBrZt24b//Oc/2L9/P3Jzc/Hkk0+atxuNRgwbNgxVVVU4dOgQ1q1bh7Vr12LBggUP+hDIgY3uHYRNkx9BKw8FzulK8PjKAzhw4YbYZRERWRWbCCWlpaWIjY3FF198AW9vb/N6vV6PL7/8Eh9//DH+9Kc/oUePHvjqq69w6NAhHD58GACwa9cunDlzBl9//TW6deuGIUOG4N1338XKlStRVVUl1iGRA+oR7IPt0/ojIkCF4vJqjPtnMv7x8y8QBPaZEBEBNhJKpkyZgmHDhiE6OtpifWpqKqqrqy3Wd+rUCUFBQUhKSgIAJCUlITw8HGq12jwmJiYGBoMB6enpd/zMyspKGAwGi4XofmlULvjmr1EY1T0AJgF474ezmPXtCVRUG8UujYhIdFYfSjZt2oRjx44hLi6u3jadTge5XA4vLy+L9Wq1Gjqdzjzm9kBSt71u253ExcVBpVKZl8DAwPs8EqJaLs5O+PDprlgwPAxOUgk2H7+KZ/6ehGv6m2KXRkQkKqsOJTk5OXjttdewfv16uLi4PNDPnjdvHvR6vXnJycl5oJ9P9k0ikeCF/iH41wu94eXmjJNX9Bjx2UEcvVwodmlERKKx6lCSmpqK/Px8dO/eHTKZDDKZDPv378eKFSsgk8mgVqtRVVWF4uJii/fl5eVBo9EAADQaTb27cepe141piEKhgFKptFiImlu/dq2wbWp/dNJ44kZpJcZ8cRgbkrPFLouISBR
WHUoGDhyIU6dOIS0tzbz07NkTsbGx5r87Oztj9+7d5vdkZGQgOzsbUVFRAICoqCicOnUK+fn55jEJCQlQKpUICwt74MdE9FuBPm7Y/EpfDAv3R7VRwBvxp/Bm/ClU1ZjELo2I6IGSiV3A7/H09ESXLl0s1rm7u8PX19e8fuLEiZg5cyZ8fHygVCoxbdo0REVF4ZFHHgEADBo0CGFhYRg7diyWLVsGnU6Ht956C1OmTIFCoXjgx0TUEDe5DP/3XCTC9inx4a4MrE/Oxvm8Enwe2wOtPfnfUyJyDFZ9pqQxPvnkEwwfPhyjRo3CgAEDoNFosHnzZvN2JycnbN++HU5OToiKisJf/vIXjBs3Du+8846IVRPVJ5FIMOWP7fCPcT3hqZAh5XIRHv+/Azh1RS92aURED4RE4CQJjWIwGKBSqaDX69lfQi3uYn4pJv/rKH65UQaFTIqlo7piZGQbscsiImqSxv6G2vyZEiJ71M7PA/FT+uGPHVujssaE6d+k4f0fzqDGyD4TIrJfDCVEVkrl6ox/jO+FKX98GADwxc+ZmLA2Bem5ehj5tGEiskO8fNNIvHxDYtp+Mhez/3MSN2/N/OrpIkPPYG/0CvFB77Y+CA9QQSFzErlKIqKGNfY3lKGkkRhKSGxncg344MdzOJJZiLIqy2npFTIpugV6oXeID3qH+KB7kDfcFVZ9cx0RORCGkmbGUELWosZowtlrJThyuRApmYU4crkQhWWWD5d0kkrQWatE77Y+6BXig15tfeDjLhepYiJydAwlzYyhhKyVIAi4dL0MKZcLcSSzdrlaXP85Ou38PGrPpNwKKm28XEWologcEUNJM2MoIVtytfim+SxKSmYhLuSX1hvTxsvVfLmnV1sfPNzaHRKJRIRqicjeMZQ0M4YSsmWFZVVIue1yT3quod4dPL7ucvRs643eIb7o3dYHof6ekDnxBj0iun8MJc2MoYTsSWllDY5nF5kv96TlFKPyN8/a8VDI0D3YG73beqNXWx9EBHrBxZl3+BDRvWMoaWYMJWTPKmuMOHVFb77cczSrCCUVNRZj5E5SRASq0Ktt7SWfHsHe8HRxFqliIrIlDCXNjKGEHInRJOCczoCUzEKkXC5CcmYhbpRWWoyRSoBQfyV6tfVBn5Da5tlWHnx4IBHVx1DSzBhKyJEJgoDLBeU4klmAI5lFSLlciOzC8nrjHmrlbm6c7R3igwBvVzbPEhFDSXNjKCGypNNXmC/3pFwuxDldSb0x/ioX9Lp1C3Lvtj5o7+cBqZQhhcjRMJQ0M4YSot9XXF6Fo5drz6IkZxbi9FU9an5zh4+nQoYubVSICPRCt0AVugZ4wV/lwrMpRHaOoaSZMZQQ3ZvyqhqkZRfjyK1J3Y5nF5uf3XO71p4KRAR4ISKgNqxEBHhB5cYGWiJ7wlDSzBhKiO5PjdGEC/mlOJFTjBNXinEiR4+MvJIGn3jc1tfNHFAiAr3QWavk7chENoyhpJkxlBA1v5tVRqTn6nHiit4cVrIK6jfQyqQSdNR43goqtWdU2vt5won9KUQ2gaGkmTGUED0YRWVVOHn1Vki5FVRulFbVG+cmd0IXrQoRgb9e9uHdPkTWiaGkmTGUEIlDEATk6ituu+xTjFNX9Cirqt+f4uMuR0RAbQNtt0AvdA1QwZdzpxCJjqGkmTGUEFkPo0nAL9dLkXYrqJy8osfZawZUG+v/cxbg7Vp7t09AbUjp0kYFd4VMhKqJHBdDSTNjKCGybhXVRpy9ZsCJnNqQknalGL9cL6s3TioBOqg90fW2u306ajzhzIcPErUYhpJmxlBCZHv0N6tx+qq+9ozKrbCiM1TUG6eQSdFZq7w1f4oXugZ4oa2vG/tTiJoJQ0kzYyghsg86fcWtSz61tyWfuFJc7+GDAKByda49m3LbZR9O9EbUNAwlzYyhhMg+mUwCLheUmedOScspxplrBlTVmOqN9XJzRpi/EqH+SoT5KxGmVaKdnwcv/RDdBUNJM2MoIXI
cVTUmZOhKkHalGCdvXfa5eL20wYne5E5StFd7WASVUH8lVK6clZaoDkNJM2MoIXJsFdVGXMwvxZlcA85cM+BMrgFnrxlQUln/0g8AtPFyRZj216AS5q/kPCrksBhKmhlDCRH9liAIuFJ0E+m/CSpXi282ON7TRWZxRiXMX4n2ag8oZJxCn+yb3YSSuLg4bN68GefOnYOrqyv69u2LpUuXomPHjuYxFRUVmDVrFjZt2oTKykrExMTg888/h1qtNo/Jzs7Gyy+/jL1798LDwwPjx49HXFwcZLLGzVfAUEJEjaUvr64NKddqQ8qZXAMu5Jc0OI+KTCpBOz8Pi6AS6q+Et7tchMqJWobdhJLBgwdj9OjR6NWrF2pqavDGG2/g9OnTOHPmDNzd3QEAL7/8Mn744QesXbsWKpUKU6dOhVQqxcGDBwEARqMR3bp1g0ajwQcffIBr165h3LhxmDRpEhYvXtyoOhhKiOh+VNWYcOm65eWfM9cM0N+sbnC8v8rFokclzF+JIB83SPm8H7JBdhNKfuv69evw8/PD/v37MWDAAOj1erRu3RobNmzAU089BQA4d+4cQkNDkZSUhEceeQQ7duzA8OHDkZubaz57snr1asyZMwfXr1+HXH73/0fCUEJEzU0QBFzTV1j2qegMDT6UEADc5U61AeW2MyodNZ58gjJZvcb+htrcXMt6vR4A4OPjAwBITU1FdXU1oqOjzWM6deqEoKAgcyhJSkpCeHi4xeWcmJgYvPzyy0hPT0dkZOSDPQgiIgASiQRaL1dovVwRHfbrv08lFdU4pysx96icuWbAOV0JyqqMOJpVhKNZReaxUgnwcGsPi6ASplWiFZ/5QzbIpkKJyWTC9OnT0a9fP3Tp0gUAoNPpIJfL4eXlZTFWrVZDp9OZx9weSOq2121rSGVlJSorK82vDQZDcx0GEdHv8nRxRq+2PujV1se8rsZowi83ysw9KnVnVgrKqnAhvxQX8kvxfVquebyfp8J86SeklTuCfdzQtpU7/DwVvAOIrJZNhZIpU6bg9OnTOHDgQIt/VlxcHN5+++0W/xwiosaQOUnRQe2JDmpPPNGtDYDayz/XSyqRfltQOXvNgMwbZcgvqUR+xnXsy7husR8XZymCfdwR7Ot2a6n9e1tfd/irXCDjRHAkIpsJJVOnTsX27duRmJiIgIAA83qNRoOqqioUFxdbnC3Jy8uDRqMxjzly5IjF/vLy8szbGjJv3jzMnDnT/NpgMCAwMLC5DoeI6L5JJBL4KV3gp3TBHzv6mdeXV9WYL/9k6EpwuaAM2YXluFJ0ExXVJmTklSAjr6Te/mRSCQJ9boUVn18DS7CvOwJ9XHnrMrU4qw8lgiBg2rRpiI+Px759+xASEmKxvUePHnB2dsbu3bsxatQoAEBGRgays7MRFRUFAIiKisL777+P/Px8+PnV/g83ISEBSqUSYWFhDX6uQqGAQsFrskRke9zkMnQP8kb3IG+L9dVGE64W3URWYTmyCspw+UY5sgvLcLmgHNmF5aiqMSHzRhkyb9R/urJEAmhVrhZnWNr6uiHo1lkXd4XV/5yQDbD6u29eeeUVbNiwAd9//73F3CQqlQqurq4Aam8J/t///oe1a9dCqVRi2rRpAIBDhw4B+PWWYK1Wi2XLlkGn02Hs2LF48cUXeUswERFqnwGkM1TUnlUpKL8VVGqDS1ZBGcqqjL/7/lYeitqQcutS0O3BxcuNc644Oru5JfhODVlfffUVnn/+eQC/Tp62ceNGi8nTbr80k5WVhZdffhn79u2Du7s7xo8fjyVLlnDyNCKiuxAEAQVlVcgqKEPWrcBS9/esgjIUlTc810odpYsMbVu5114O8rE809KajbcOwW5CibVgKCEiapj+ZvWtsyu1vSuXb9wKLIVlyDNU/u57XZ2d6jXd1jXiar1c4cTJ4uwCQ0kzYyghIrp3N6uMtUGloOy2syu1geVq0U008OBlM5lUArXSBX5KBTRKF6iVLtCoXKB
WKmr/fmsd+1msn91OnkZERLbDVe6EjhpPdNR41ttWVWPClaLy2sbbG2W3GnBrA8yVwpuoMppwtfjmHR9wWMdTIYP6N2FFo3KBn2ftnxqlC1p5yHm7sw1gKCEiIlHIZVI81NoDD7X2ADpabjOaBOQZKqAzVCBPX3Hr75W1f+orkFdSu76syoiSyhqU5JfiYn7pHT9LKqltxv01rPx69uXXMzAuULrI2OMiIoYSIiKyOk7SX6fg/z0lFdXIayCs6AwV5vX5JZUwmoTaCeVKKgHo77g/V2cn8xmX28OK+rZLSH5KBedsaSEMJUREZLM8XZzh6eKMdn4edxxjNAkoKKtEnr7yVlipMIcYnaEC+Yba9fqb1bhZbcTlW3cY/R4fd/mtS0W/DTC/vvZ2k7NR9x4xlBARkV1zkkrg51l72SYcqjuOu1llRH5J/bBS+/dfz75U1ZhQWFaFwrIqnL1258+VSAAvV2d4u8vh6y6Ht5scvh61f/q4N7y4yR37Z9mxj56IiOgWV7nTrduS3e84RhAEFJdXW4aVW2dgfg0uFbhRWgVBAIrKq1FUXo1frtefJbchLs5S+LjJ4XMrvPi6y+HjroCPu3ODf6pcne3qbAxDCRERUSNJJBJ4u8vh7S5HqP+db22tMZpuBZIqFJRW1f5ZVoWiW2dY6pbb11UZTaioNiFXX4FcfUUj6wG83eTwdnOGr7sC3ncIL3VBx8dNDle59fbDMJQQERE1M5mTFK09FWjtqQDUdx8vCALKqowoLK1CYXkVCssqUVhW3eCfReXVKCithKGiBoIAc8C51MizMa7OTubLRXe6tNRZq0Sgj9t9fgv3jqGEiIhIZBKJBB4KGTwUMgT5Ni4MVBtNKCqvQlFZNQrKKlH02xBT/ttQU4Vqo4Cb1ca7zv/yxtBOmDzg4eY6vEZjKCEiIrJBzk5ScwMvUH9yut8SBAGllTW/hpjfubQU5HPnvpqWxFBCRETkACQSifkW6saejXnQOOcuERERWQWGEiIiIrIKDCVERERkFRhKiIiIyCowlBAREZFVYCghIiIiq8BQQkRERFaBoYSIiIisAkMJERERWQWGEiIiIrIKDCVERERkFfjsm0YSBAEAYDAYRK6EiIjIttT9dtb9lt4JQ0kjlZSUAAACAwNFroSIiMg2lZSUQKVS3XG7RLhbbCEAgMlkQm5uLjw9PSGRSMQuR3QGgwGBgYHIycmBUqkUuxyHwe9dHPzeHzx+5+Joqe9dEASUlJRAq9VCKr1z5wjPlDSSVCpFQECA2GVYHaVSyX8wRMDvXRz83h88fufiaInv/ffOkNRhoysRERFZBYYSIiIisgoMJdQkCoUCCxcuhEKhELsUh8LvXRz83h88fufiEPt7Z6MrERERWQWeKSEiIiKrwFBCREREVoGhhIiIiKwCQwkRERFZBYYSarS4uDj06tULnp6e8PPzw8iRI5GRkSF2WQ5nyZIlkEgkmD59util2L2rV6/iL3/5C3x9feHq6orw8HAcPXpU7LLsmtFoxPz58xESEgJXV1c8/PDDePfdd+/6zBS6N4mJiRgxYgS0Wi0kEgm2bNlisV0QBCxYsAD+/v5wdXVFdHQ0Lly40OJ1MZRQo+3fvx9TpkzB4cOHkZCQgOrqagwaNAhlZWVil+YwUlJS8Pe//x1du3YVuxS7V1RUhH79+sHZ2Rk7duzAmTNn8NFHH8Hb21vs0uza0qVLsWrVKvzf//0fzp49i6VLl2LZsmX47LPPxC7NrpSVlSEiIgIrV65scPuyZcuwYsUKrF69GsnJyXB3d0dMTAwqKipatC7eEkxNdv36dfj5+WH//v0YMGCA2OXYvdLSUnTv3h2ff/453nvvPXTr1g2ffvqp2GXZrblz5+LgwYP4+eefxS7FoQwfPhxqtRpffvmled2oUaPg6uqKr7/+WsTK7JdEIkF8fDxGjhwJoPYsiVarxaxZs/D6668DAPR6PdRqNdauXYvRo0e3WC08U0JNptfrAQA
+Pj4iV+IYpkyZgmHDhiE6OlrsUhzC1q1b0bNnTzz99NPw8/NDZGQkvvjiC7HLsnt9+/bF7t27cf78eQDAiRMncODAAQwZMkTkyhxHZmYmdDqdxb81KpUKffr0QVJSUot+Nh/IR01iMpkwffp09OvXD126dBG7HLu3adMmHDt2DCkpKWKX4jB++eUXrFq1CjNnzsQbb7yBlJQUvPrqq5DL5Rg/frzY5dmtuXPnwmAwoFOnTnBycoLRaMT777+P2NhYsUtzGDqdDgCgVqst1qvVavO2lsJQQk0yZcoUnD59GgcOHBC7FLuXk5OD1157DQkJCXBxcRG7HIdhMpnQs2dPLF68GAAQGRmJ06dPY/Xq1QwlLejbb7/F+vXrsWHDBnTu3BlpaWmYPn06tFotv3cHwMs3dM+mTp2K7du3Y+/evQgICBC7HLuXmpqK/Px8dO/eHTKZDDKZDPv378eKFSsgk8lgNBrFLtEu+fv7IywszGJdaGgosrOzRarIMcyePRtz587F6NGjER4ejrFjx2LGjBmIi4sTuzSHodFoAAB5eXkW6/Py8szbWgpDCTWaIAiYOnUq4uPjsWfPHoSEhIhdkkMYOHAgTp06hbS0NPPSs2dPxMbGIi0tDU5OTmKXaJf69etX75b38+fPIzg4WKSKHEN5eTmkUsufJicnJ5hMJpEqcjwhISHQaDTYvXu3eZ3BYEBycjKioqJa9LN5+YYabcqUKdiwYQO+//57eHp6mq8tqlQquLq6ilyd/fL09KzXt+Pu7g5fX1/287SgGTNmoG/fvli8eDGeeeYZHDlyBGvWrMGaNWvELs2ujRgxAu+//z6CgoLQuXNnHD9+HB9//DFeeOEFsUuzK6Wlpbh48aL5dWZmJtLS0uDj44OgoCBMnz4d7733Htq3b4+QkBDMnz8fWq3WfIdOixGIGglAg8tXX30ldmkO57HHHhNee+01scuwe9u2bRO6dOkiKBQKoVOnTsKaNWvELsnuGQwG4bXXXhOCgoIEFxcX4aGHHhLefPNNobKyUuzS7MrevXsb/Pd8/PjxgiAIgslkEubPny+o1WpBoVAIAwcOFDIyMlq8Ls5TQkRERFaBPSVERERkFRhKiIiIyCowlBAREZFVYCghIiIiq8BQQkRERFaBoYSIiIisAkMJERERWQWGEiK6b5cvX4ZEIkFaWprYpZidO3cOjzzyCFxcXNCtW7d7fr81HhORvWMoIbIDzz//PCQSCZYsWWKxfsuWLZBIJCJVJa6FCxfC3d0dGRkZFs/wEMvatWvh5eUldhlEVo2hhMhOuLi4YOnSpSgqKhK7lGZTVVXV5PdeunQJ/fv3R3BwMHx9fZuxKnEZjUY+nI7sFkMJkZ2Ijo6GRqP53Ue8L1q0qN6ljE8//RRt27Y1v37++ecxcuRILF68GGq1Gl5eXnjnnXdQU1OD2bNnw8fHBwEBAfjqq6/q7f/cuXPo27cvXFxc0KVLF+zfv99i++nTpzFkyBB4eHhArVZj7NixuHHjhnn7H/7wB0ydOhXTp09Hq1atEBMT0+BxmEwmvPPOOwgICIBCoUC3bt2wc+dO83aJRILU1FS88847kEgkWLRo0R33s2zZMrRr1w4KhQJBQUF4//33Gxzb0JmO356JOnHiBP74xz/C09MTSqUSPXr0wNGjR7Fv3z5MmDABer0eEonEoqbKykq8/vrraNOmDdzd3dGnTx/s27ev3udu3boVYWFhUCgUyM7Oxr59+9C7d2+4u7vDy8sL/fr1Q1ZWVoO1E9kKhhIiO+Hk5ITFixfjs88+w5UrV+5rX3v27EFubi4SExPx8ccfY+HChRg+fDi8vb2RnJyMl156CX/961/rfc7s2bMxa9YsHD9+HFFRURgxYgQKCgoAAMXFxfjTn/6EyMhIHD16FDt37kReXh6eeeYZi32sW7cOcrkcBw8exOrVqxusb/ny5fjoo4/w4Ycf4uTJk4iJicHjjz+OCxcuAACuXbuGzp07Y9asWbh
27Rpef/31Bvczb948LFmyBPPnz8eZM2ewYcMGqNXqJn9vsbGxCAgIQEpKClJTUzF37lw4Ozujb9+++PTTT6FUKnHt2jWLmqZOnYqkpCRs2rQJJ0+exNNPP43BgwebjwUAysvLsXTpUvzjH/9Aeno6fHx8MHLkSDz22GM4efIkkpKSMHnyZIe9VEd2pMUf+UdELW78+PHCE088IQiCIDzyyCPCCy+8IAiCIMTHxwu3/8984cKFQkREhMV7P/nkEyE4ONhiX8HBwYLRaDSv69ixo/Doo4+aX9fU1Aju7u7Cxo0bBUEQhMzMTAGAsGTJEvOY6upqISAgQFi6dKkgCILw7rvvCoMGDbL47JycHAGA+emjjz32mBAZGXnX49VqtcL7779vsa5Xr17CK6+8Yn4dEREhLFy48I77MBgMgkKhEL744osGt9cd0/HjxwVBEISvvvpKUKlUFmN++/16enoKa9eubXB/Db0/KytLcHJyEq5evWqxfuDAgcK8efPM7wMgpKWlmbcXFBQIAIR9+/bd8fiIbBHPlBDZmaVLl2LdunU4e/Zsk/fRuXNnSKW//vOgVqsRHh5ufu3k5ARfX1/k5+dbvC8qKsr8d5lMhp49e5rrOHHiBPbu3QsPDw/z0qlTJwC1/R91evTo8bu1GQwG5Obmol+/fhbr+/Xrd0/HfPbsWVRWVmLgwIGNfs/dzJw5Ey+++CKio6OxZMkSi+NqyKlTp2A0GtGhQweL72X//v0W75XL5ejatav5tY+PD55//nnExMRgxIgRWL58Oa5du9Zsx0EkFoYSIjszYMAAxMTEYN68efW2SaVSCIJgsa66urreOGdnZ4vXEomkwXX30nBZWlqKESNGIC0tzWK5cOECBgwYYB7n7u7e6H3eD1dX13sa35jvbtGiRUhPT8ewYcOwZ88ehIWFIT4+/o77LC0thZOTE1JTUy2+k7Nnz2L58uUWtf720sxXX32FpKQk9O3bF9988w06dOiAw4cP39MxEVkbhhIiO7RkyRJs27YNSUlJFutbt24NnU5n8ePanPNw3P6jWFNTg9TUVISGhgIAunfvjvT0dLRt2xbt2rWzWO4liCiVSmi1Whw8eNBi/cGDBxEWFtbo/bRv3x6urq6Nvl24devWKCkpQVlZmXldQ99dhw4dMGPGDOzatQtPPvmkuSFYLpfDaDRajI2MjITRaER+fn6970Sj0dy1psjISMybNw+HDh1Cly5dsGHDhkYdC5G1YighskPh4eGIjY3FihUrLNb/4Q9/wPXr17Fs2TJcunQJK1euxI4dO5rtc1euXIn4+HicO3cOU6ZMQVFREV544QUAwJQpU1BYWIgxY8YgJSUFly5dwo8//ogJEybU+7G+m9mzZ2Pp0qX45ptvkJGRgblz5yItLQ2vvfZao/fh4uKCOXPm4G9/+xv+9a9/4dKlSzh8+DC+/PLLBsf36dMHbm5ueOONN3Dp0iVs2LABa9euNW+/efMmpk6din379iErKwsHDx5ESkqKOZS1bdsWpaWl2L17N27cuIHy8nJ06NABsbGxGDduHDZv3ozMzEwcOXIEcXFx+OGHH+5Ye2ZmJubNm4ekpCRkZWVh165duHDhgvmziGwVQwmRnXrnnXfqXV4JDQ3F559/jpUrVyIiIgJHjhy5450pTbFkyRIsWbIEEREROHDgALZu3YpWrVoBgPnshtFoxKBBgxAeHo7p06fDy8vLon+lMV599VXMnDkTs2bNQnh4OHbu3ImtW7eiffv297Sf+fPnY9asWViwYAFCQ0Px7LPP1uuTqePj44Ovv/4a//vf/xAeHo6NGzda3Grs5OSEgoICjBs3Dh06dMAzzzyDIUOG4O233wYA9O3bFy+99BKeffZZtG7dGsuWLQNQexlm3LhxmDVrFjp27IiRI0ciJSUFQUFBd6zbzc0N586dw6hRo9ChQwdMnjwZU6ZMwV//+td7On4iayMRfnuRlIiIiEgEPFNCREREVoGhhIiIiKw
CQwkRERFZBYYSIiIisgoMJURERGQVGEqIiIjIKjCUEBERkVVgKCEiIiKrwFBCREREVoGhhIiIiKwCQwkRERFZBYYSIiIisgr/H24D6wEdzeboAAAAAElFTkSuQmCC",
+      "text/plain": [
+       "<Figure size 600x400 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "df2 = pd.DataFrame(np.random.uniform(0, 10, (100, 2)))\n",
+    "\n",
+    "s = pd.Series(dtype=float)\n",
+    "\n",
+    "for num_clusters in range(1, 11):\n",
+    "    km = KMeans(num_clusters, n_init = 320)\n",
+    "    km.fit(df2)\n",
+    "    s.at[num_clusters] = km.inertia_\n",
+    "\n",
+    "ax = s.plot.line(figsize=(6, 4))\n",
+    "ax.set_ylabel(\"Inertia\")\n",
+    "ax.set_xlabel(\"Number of clusters\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c6decdab-74a3-45b5-a408-e7b54bd992d9",
+   "metadata": {},
+   "source": [
+    "**Observation**: uniform random data has no real cluster structure, so the inertia curve tends to decrease gradually without a clear \"elbow\"."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3c6115b0-61df-4660-8355-3cd56bd94080",
+   "metadata": {},
+   "source": [
+    "#### Will we always have a clear \"elbow\"?\n",
+    "\n",
+    "- Let's generate uniform random data again, this time looking at the points with a scatter plot before drawing the elbow plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "424743f8-41de-42b8-ab78-07901682ee84",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<Axes: xlabel='0', ylabel='1'>"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "df2 = pd.DataFrame(np.random.uniform(0, 10, (100, 2)))\n",
+    "df2.plot.scatter(0, 1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "fba71303-d4c6-46d3-ae8e-f0e863d8e032",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Text(0.5, 0, 'Number of clusters')"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 600x400 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "s = pd.Series(dtype=float)\n",
+    "\n",
+    "for num_clusters in range(1, 11):\n",
+    "    km = KMeans(num_clusters, n_init = 320)\n",
+    "    km.fit(df2)\n",
+    "    s.at[num_clusters] = km.inertia_\n",
+    "\n",
+    "ax = s.plot.line(figsize=(6, 4))\n",
+    "ax.set_ylabel(\"Inertia\")\n",
+    "ax.set_xlabel(\"Number of clusters\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8af299e7",
+   "metadata": {},
+   "source": [
+    "### K-Means use cases:\n",
+    "\n",
+    "1. estimator\n",
+    "2. transformer:\n",
+    "    - sometimes we'll use an unsupervised learning technique (like k-means) to pre-process data, creating better inputs for a supervised learning technique (like logistic regression)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "6b99861d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def make_data():\n",
+    "    x, y = datasets.make_blobs(n_samples=250, centers=5, random_state=5)\n",
+    "    xcols = [\"x0\", \"x1\"]\n",
+    "    df1 = pd.DataFrame(x, columns=xcols)\n",
+    "    df1[\"y\"] = y > 0\n",
+    "\n",
+    "    df2 = pd.DataFrame(np.random.uniform(-10, 10, size=(250, 2)), columns=[\"x0\", \"x1\"])\n",
+    "    df2[\"y\"] = False\n",
+    "\n",
+    "    return pd.concat((df1, df2))\n",
+    "\n",
+    "train, test = train_test_split(make_data())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "c1a0353f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 1000x400 with 2 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "plt.rcParams[\"font.size\"] = 16\n",
+    "fig, ax = plt.subplots(ncols=2, figsize=(10,4))\n",
+    "train.plot.scatter(x=\"x0\", y=\"x1\", c=train[\"y\"], vmin=-1, ax=ax[0])\n",
+    "test.plot.scatter(x=\"x0\", y=\"x1\", c=\"red\", ax=ax[1])\n",
+    "ax[0].set_title(\"Training Data\")\n",
+    "ax[1].set_title(\"Test Data\")\n",
+    "plt.subplots_adjust(wspace=0.4)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "57800660",
+   "metadata": {},
+   "source": [
+    "#### Objective: use `LogisticRegression` to classify points as \"black\" or \"gray\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "cba5b0b6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/gurmail.singh/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
+      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
+      "\n",
+      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
+      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
+      "Please also refer to the documentation for alternative solver options:\n",
+      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
+      "  n_iter_i = _check_optimize_result(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0.768"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model = Pipeline([\n",
+    "    (\"km\", KMeans(10, n_init = 320)),\n",
+    "    (\"lr\", LogisticRegression()),\n",
+    "])\n",
+    "model.fit(train[[\"x0\", \"x1\"]], train[\"y\"])\n",
+    "model.score(test[[\"x0\", \"x1\"]], test[\"y\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "e78a788c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.784"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model = Pipeline([\n",
+    "    (\"km\", KMeans(10, n_init = 320)),\n",
+    "    (\"std\", StandardScaler()),\n",
+    "    (\"lr\", LogisticRegression()),\n",
+    "])\n",
+    "model.fit(train[[\"x0\", \"x1\"]], train[\"y\"])\n",
+    "model.score(test[[\"x0\", \"x1\"]], test[\"y\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0d506007",
+   "metadata": {},
+   "source": [
+    "### `StandardScaler` with `KMeans`\n",
+    "\n",
+    "Recall that `StandardScaler` should always be applied after applying `PolynomialFeatures` (from last lecture)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "1229aad1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<Axes: xlabel='0', ylabel='1'>"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 600x400 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "x = datasets.make_blobs(centers=np.array([(0, 0), (0, 20), (3, 20)]))[0]\n",
+    "df = pd.DataFrame(x)\n",
+    "df.plot.scatter(x=0, y=1, figsize=(6, 4))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "9f21a66d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,\n",
+       "       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,\n",
+       "       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,\n",
+       "       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,\n",
+       "       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0], dtype=int32)"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "km_c = KMeans(2, n_init = 320)\n",
+    "km_c.fit(df)\n",
+    "km_c.predict(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7dc700d4",
+   "metadata": {},
+   "source": [
+    "#### `fit_predict(...)` is a shortcut for `fit` and `predict` method invocations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "7bc1d18d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,\n",
+       "       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,\n",
+       "       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,\n",
+       "       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,\n",
+       "       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0], dtype=int32)"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "KMeans(2, n_init = 320).fit_predict(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "b2dfa6bd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<Axes: xlabel='0', ylabel='1'>"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 600x400 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# -1 => white, 0 => gray, 1 => black\n",
+    "df.plot.scatter(x=0, y=1, figsize=(6, 4), c=KMeans(2, n_init = 320).fit_predict(df), vmin=-1, vmax=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "762f2882",
+   "metadata": {},
+   "source": [
+    "**Observation**: the units (scale) of the two columns are intentionally left unspecified."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "4f99dfeb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>3.244656</td>\n",
+       "      <td>17.882965</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.519742</td>\n",
+       "      <td>-1.214826</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2.055869</td>\n",
+       "      <td>19.481308</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0.863258</td>\n",
+       "      <td>19.307939</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>-0.270818</td>\n",
+       "      <td>19.559756</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>95</th>\n",
+       "      <td>-1.352074</td>\n",
+       "      <td>20.421015</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>96</th>\n",
+       "      <td>0.572919</td>\n",
+       "      <td>0.985182</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>97</th>\n",
+       "      <td>3.724053</td>\n",
+       "      <td>20.445860</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>98</th>\n",
+       "      <td>1.663830</td>\n",
+       "      <td>20.743809</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>99</th>\n",
+       "      <td>2.756283</td>\n",
+       "      <td>19.533742</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>100 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           0          1\n",
+       "0   3.244656  17.882965\n",
+       "1   0.519742  -1.214826\n",
+       "2   2.055869  19.481308\n",
+       "3   0.863258  19.307939\n",
+       "4  -0.270818  19.559756\n",
+       "..       ...        ...\n",
+       "95 -1.352074  20.421015\n",
+       "96  0.572919   0.985182\n",
+       "97  3.724053  20.445860\n",
+       "98  1.663830  20.743809\n",
+       "99  2.756283  19.533742\n",
+       "\n",
+       "[100 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d444185c",
+   "metadata": {},
+   "source": [
+    "Let's make a copy of the data. Assuming initial data for both columns is in \"km\", let's convert one column (`0`) into \"meters\". "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "2e437218",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>3244.655538</td>\n",
+       "      <td>17.882965</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>519.741624</td>\n",
+       "      <td>-1.214826</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2055.868744</td>\n",
+       "      <td>19.481308</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>863.258081</td>\n",
+       "      <td>19.307939</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>-270.817705</td>\n",
+       "      <td>19.559756</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             0          1\n",
+       "0  3244.655538  17.882965\n",
+       "1   519.741624  -1.214826\n",
+       "2  2055.868744  19.481308\n",
+       "3   863.258081  19.307939\n",
+       "4  -270.817705  19.559756"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df2 = df.copy()\n",
+    "df2[0] *= 1000 # km => m\n",
+    "df2.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "c99315a5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<Axes: xlabel='0', ylabel='1'>"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 600x400 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "df2.plot.scatter(x=0, y=1, figsize=(6,4), c=KMeans(2, n_init = 320).fit_predict(df2), vmin=-1, vmax=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3966ddd3",
+   "metadata": {},
+   "source": [
+    "**Observations**:\n",
+    "- One would expect to see the same clusters, but that is not happening here. Why?\n",
+    "    - After the conversion, differences along the x-axis (meters) are numerically about 1000x larger than differences along the y-axis (km), so they dominate the distance computation\n",
+    "    - That is, KMeans has no notion of units: it treats both columns as plain numbers, so the column with the larger numeric range decides the clusters\n",
+    "- This is not too far off from realistic datasets.\n",
+    "    - That is, real-world dataset columns might have different units.\n",
+    "    - For example, one column might represent temperature whereas another might represent distance."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b63c9d9b",
+   "metadata": {},
+   "source": [
+    "#### Conclusion: `StandardScaler` should be applied before `KMeans`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "2c81ed04",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<Axes: xlabel='0', ylabel='1'>"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 600x400 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "model = Pipeline([\n",
+    "    (\"std\", StandardScaler()),\n",
+    "    (\"km\", KMeans(2, n_init = 320)),\n",
+    "])\n",
+    "\n",
+    "df2.plot.scatter(x=0, y=1, figsize=(6, 4), c=model.fit_predict(df2), vmin=-1, vmax=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "359dd107",
+   "metadata": {},
+   "source": [
+    "### Wisconsin counties example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "8847306e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>NAME</th>\n",
+       "      <th>POP100</th>\n",
+       "      <th>AREALAND</th>\n",
+       "      <th>HU100</th>\n",
+       "      <th>developed</th>\n",
+       "      <th>forest</th>\n",
+       "      <th>pasture</th>\n",
+       "      <th>crops</th>\n",
+       "      <th>geometry</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Racine County</td>\n",
+       "      <td>195408</td>\n",
+       "      <td>861533739</td>\n",
+       "      <td>82164</td>\n",
+       "      <td>0.230906</td>\n",
+       "      <td>0.100167</td>\n",
+       "      <td>0.072588</td>\n",
+       "      <td>0.482126</td>\n",
+       "      <td>POLYGON ((645313.81834 2212738.58489, 645456.3...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Clark County</td>\n",
+       "      <td>34690</td>\n",
+       "      <td>3133378070</td>\n",
+       "      <td>15076</td>\n",
+       "      <td>0.046476</td>\n",
+       "      <td>0.326691</td>\n",
+       "      <td>0.022979</td>\n",
+       "      <td>0.444642</td>\n",
+       "      <td>POLYGON ((431909.29098 2393751.35940, 433872.5...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Wood County</td>\n",
+       "      <td>74749</td>\n",
+       "      <td>2054044751</td>\n",
+       "      <td>34088</td>\n",
+       "      <td>0.080285</td>\n",
+       "      <td>0.226244</td>\n",
+       "      <td>0.023411</td>\n",
+       "      <td>0.320990</td>\n",
+       "      <td>POLYGON ((498653.94690 2388370.84202, 498647.3...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Rusk County</td>\n",
+       "      <td>14755</td>\n",
+       "      <td>2366092584</td>\n",
+       "      <td>8883</td>\n",
+       "      <td>0.035567</td>\n",
+       "      <td>0.473937</td>\n",
+       "      <td>0.049572</td>\n",
+       "      <td>0.138357</td>\n",
+       "      <td>POLYGON ((397166.23292 2498521.78567, 397167.7...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Ozaukee County</td>\n",
+       "      <td>86395</td>\n",
+       "      <td>603514413</td>\n",
+       "      <td>36267</td>\n",
+       "      <td>0.222642</td>\n",
+       "      <td>0.088609</td>\n",
+       "      <td>0.127867</td>\n",
+       "      <td>0.389109</td>\n",
+       "      <td>POLYGON ((654796.85595 2272096.94081, 654799.8...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             NAME  POP100    AREALAND  HU100  developed    forest   pasture  \\\n",
+       "0   Racine County  195408   861533739  82164   0.230906  0.100167  0.072588   \n",
+       "1    Clark County   34690  3133378070  15076   0.046476  0.326691  0.022979   \n",
+       "2     Wood County   74749  2054044751  34088   0.080285  0.226244  0.023411   \n",
+       "3     Rusk County   14755  2366092584   8883   0.035567  0.473937  0.049572   \n",
+       "4  Ozaukee County   86395   603514413  36267   0.222642  0.088609  0.127867   \n",
+       "\n",
+       "      crops                                           geometry  \n",
+       "0  0.482126  POLYGON ((645313.81834 2212738.58489, 645456.3...  \n",
+       "1  0.444642  POLYGON ((431909.29098 2393751.35940, 433872.5...  \n",
+       "2  0.320990  POLYGON ((498653.94690 2388370.84202, 498647.3...  \n",
+       "3  0.138357  POLYGON ((397166.23292 2498521.78567, 397167.7...  \n",
+       "4  0.389109  POLYGON ((654796.85595 2272096.94081, 654799.8...  "
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = gpd.read_file(\"counties.geojson\")\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "377e9bb0",
+   "metadata": {},
+   "source": [
+    "#### If we want to use \"POP100\", \"AREALAND\", \"developed\", \"forest\", \"pasture\", \"crops\" for clustering, what transformer should we use? \n",
+    "\n",
+    "- StandardScaler."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "15f0b21c",
+   "metadata": {},
+   "source": [
+    "### Goal here: cluster counties based on similar land usage."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "55013d0a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<Axes: >"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "df.plot()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "a199a2af",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<Axes: >"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "df.plot(column=\"crops\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "8735a7bb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<Axes: >"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "df.plot(column=\"forest\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a42394cb",
+   "metadata": {},
+   "source": [
+    "### KMeans"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "a8b3c831",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.290908515122804\n",
+      "[3 0 0 2 3 0 3 1 2 0 3 3 3 2 0 1 0 0 2 2 0 2 0 3 0 0 0 0 2 3 3 3 2 0 0 0 3\n",
+      " 0 2 3 3 0 0 3 2 2 2 3 0 0 0 3 2 2 2 3 0 0 3 3 2 3 2 2 3 0 2 2 0 2 2 0]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<Axes: >"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "xcols = [\"developed\", \"forest\", \"pasture\", \"crops\"]\n",
+    "\n",
+    "# instantiate\n",
+    "km_c = KMeans(4, n_init = 320)\n",
+    "# fit\n",
+    "km_c.fit(df[xcols])\n",
+    "# predict\n",
+    "clusters = km_c.predict(df[xcols])\n",
+    "\n",
+    "print(km_c.inertia_)\n",
+    "print(clusters)\n",
+    "\n",
+    "df.plot(column=clusters, cmap=\"tab10\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "01189d61",
+   "metadata": {},
+   "source": [
+    "**Observation**: cluster numbering is arbitrary. That is, if you re-run the above cell, the same group of counties may be assigned a different cluster number."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1764b5a6",
+   "metadata": {},
+   "source": [
+    "### Agglomerative clustering\n",
+    "\n",
+    "- import statement\n",
+    "```python\n",
+    "from sklearn.cluster import AgglomerativeClustering\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "3d7954a3",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "'AgglomerativeClustering' object has no attribute 'predict'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[36], line 8\u001b[0m\n\u001b[1;32m      6\u001b[0m km_c\u001b[38;5;241m.\u001b[39mfit(df[xcols])\n\u001b[1;32m      7\u001b[0m \u001b[38;5;66;03m# predict\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m clusters \u001b[38;5;241m=\u001b[39m \u001b[43mkm_c\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m(df[xcols])\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28mprint\u001b[39m(km_c\u001b[38;5;241m.\u001b[39minertia_)\n\u001b[1;32m     11\u001b[0m \u001b[38;5;28mprint\u001b[39m(clusters)\n",
+      "\u001b[0;31mAttributeError\u001b[0m: 'AgglomerativeClustering' object has no attribute 'predict'"
+     ]
+    }
+   ],
+   "source": [
+    "xcols = [\"developed\", \"forest\", \"pasture\", \"crops\"]\n",
+    "\n",
+    "# instantiate\n",
+    "km_c = AgglomerativeClustering(4)\n",
+    "# fit\n",
+    "km_c.fit(df[xcols])\n",
+    "# predict\n",
+    "clusters = km_c.predict(df[xcols])\n",
+    "\n",
+    "print(km_c.inertia_)\n",
+    "print(clusters)\n",
+    "\n",
+    "df.plot(column=clusters, cmap=\"tab10\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6f825891",
+   "metadata": {},
+   "source": [
+    "**Observations**: \n",
+    "- no centroids => no inertia => no elbow plots (how do we pick cluster count?):\n",
+    "    - AttributeError: 'AgglomerativeClustering' object has no attribute 'inertia_'\n",
+    "- no `predict` method, but there is `fit_predict`:\n",
+    "    - AttributeError: 'AgglomerativeClustering' object has no attribute 'predict'\n",
+    "    - why?\n",
+    "        - because each point could lead to a completely different tree\n",
+    "        - remember unlike KMeans (which is top-down), AgglomerativeClustering is bottom-up"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e16c6131",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "xcols = [\"developed\", \"forest\", \"pasture\", \"crops\"]\n",
+    "\n",
+    "# instantiate\n",
+    "km_c = AgglomerativeClustering(4)\n",
+    "# fit_predict\n",
+    "clusters = km_c.fit_predict(df[xcols])\n",
+    "\n",
+    "# print(km_c.inertia_)\n",
+    "print(clusters)\n",
+    "\n",
+    "df.plot(column=clusters, cmap=\"tab10\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5fc2bf08-a0e3-49a3-b089-af0f99f83613",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/lecture_material/24-clustering/24-clustering_001.ipynb b/lecture_material/24-clustering/24-clustering_001.ipynb
new file mode 100644
index 0000000..1550e87
--- /dev/null
+++ b/lecture_material/24-clustering/24-clustering_001.ipynb
@@ -0,0 +1,916 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "035e9e2c-9781-4b9c-8395-be9e55e4e082",
+   "metadata": {},
+   "source": [
+    "# Clustering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cbd48a28",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import geopandas as gpd\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "from sklearn import datasets\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.linear_model import LinearRegression, LogisticRegression\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.preprocessing import PolynomialFeatures\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "\n",
+    "# new import statements\n",
+    "from sklearn.cluster import KMeans, AgglomerativeClustering"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e72ea2f4",
+   "metadata": {},
+   "source": [
+    "# Unsupervised Machine Learning: Clustering\n",
+    "\n",
+    "- In classification (supervised), we try to find boundaries/rules to separate points according to pre-determined labels.\n",
+    "- In clustering, the algorithm chooses the labels.  Goal is to choose labels so that similar rows get labeled the same.\n",
+    "\n",
+    "### K-Means Clustering\n",
+    "\n",
+    "- K: number of clusters:\n",
+    "    - 3-Means => 3 clusters\n",
+    "    - 4-Means => 4 clusters, and so on\n",
+    "- Means: we will find centroids (aka means aka averages) to create clusters\n",
+    "\n",
+    "- import statement:\n",
+    "```python\n",
+    "from sklearn.cluster import KMeans\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3a0ad5a5",
+   "metadata": {},
+   "source": [
+    "#### Iterative algorithm for K-Means"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0b83aaf3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Generate random data\n",
+    "x, y = datasets.make_blobs(n_samples=100, centers=3, cluster_std=1.2, random_state=3)\n",
+    "df = pd.DataFrame(x, columns=[\"x0\", \"x1\"])\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fbced908",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def km_scatter(df, **kwargs):\n",
+    "    \"\"\"\n",
+    "    Produces scatter plot visualizations with x0 on x-axis and x1 on y-axis.\n",
+    "    It can also plot the centroids for clusters.\n",
+    "    Parameters:\n",
+    "        x0 => x-axis\n",
+    "        x1 => y-axis\n",
+    "        label => marker type\n",
+    "    \"\"\"\n",
+    "    ax = kwargs.pop(\"ax\", None)\n",
+    "    if not \"label\" in df.columns:\n",
+    "        return df.plot.scatter(x=\"x0\", y=\"x1\", marker=\"$?$\", ax=ax, **kwargs)\n",
+    "\n",
+    "    for marker in set(df[\"label\"]):\n",
+    "        sub_df = df[df[\"label\"] == marker]\n",
+    "        ax = sub_df.plot.scatter(x=\"x0\", y=\"x1\", marker=marker, ax=ax, **kwargs)\n",
+    "    return ax\n",
+    "\n",
+    "ax = km_scatter(df, s=100, c=\"0.7\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "47686eee",
+   "metadata": {},
+   "source": [
+    "### Hard Problem\n",
+    "\n",
+    "Finding the best answer. What is the answer? Determining the centroids of the clusters.\n",
+    "\n",
+    "### Easier Problem\n",
+    "\n",
+    "Taking a random answer and make it a little better. Then repeat!\n",
+    "Downside? If randomization leads to very bad initial choice of centroids, that might lead to bad clustering (fewer clusters)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6f8bde9e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "clusters = np.random.uniform(-5, 5, size=(3, 2))\n",
+    "clusters = pd.DataFrame(clusters, columns=[\"x0\", \"x1\"])\n",
+    "clusters[\"label\"] = [\"o\", \"+\", \"x\"]\n",
+    "\n",
+    "ax = km_scatter(df, s=100, c=\"0.7\")\n",
+    "km_scatter(clusters, s=200, c=\"red\", ax=ax)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a3fe986c",
+   "metadata": {},
+   "source": [
+    "Two variables for us to deal with:\n",
+    "1. clusters: contains location of centroids and a label for them\n",
+    "2. df: contains the actual data points"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cfa1f1aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "clusters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f210c534",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a28466ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class KM:\n",
+    "    def __init__(self, df, clusters):\n",
+    "        # We make copies because we are going to keep changing the dataframe to \n",
+    "        # identify better clusters\n",
+    "        self.df = df.copy()\n",
+    "        self.clusters = clusters.copy()\n",
+    "        self.labels = clusters[\"label\"].values\n",
+    "        \n",
+    "    def plot(self):\n",
+    "        ax = km_scatter(self.df, color=\"0.7\", s=100)\n",
+    "        km_scatter(self.clusters, ax=ax, color=\"red\", s=200)\n",
+    "        \n",
+    "    def assign_points(self):\n",
+    "        \"\"\"\n",
+    "        compute Euclidean distance between each point and each centroid\n",
+    "        \"\"\"\n",
+    "        pass\n",
+    "    \n",
+    "    def update_centers(self):\n",
+    "        \"\"\"\n",
+    "        update centroids by taking mean of the points that are nearest to that\n",
+    "        particular centroid\n",
+    "        \"\"\"\n",
+    "        pass\n",
+    "\n",
+    "\"\"\"\n",
+    "High-level algorithm:\n",
+    "1. Start with random locations for centroids\n",
+    "2. Iterate over each data point:\n",
+    "    1. Find the distance (Euclidean distance) between current data point and each centroid.\n",
+    "    2. Find the minimum of those distances and the corresponding label.\n",
+    "    3. Assign current data point to the closest cluster centroid label.\n",
+    "4. Once all points are assigned, compute new centroid for each cluster. Iterate over \n",
+    "   each cluster:\n",
+    "    1. Extract subset of data points which got assigned to curr cluster label.\n",
+    "    2. Compute mean of all the assigned data points.\n",
+    "    3. Update cluster centroid.\n",
+    "5. Repeat steps 2 to 4 many times (iterative improvement).\n",
+    "\"\"\"\n",
+    "\n",
+    "# Creating object instance\n",
+    "km = KM(df, clusters)\n",
+    "km.plot()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "938a6cc5",
+   "metadata": {},
+   "source": [
+    "### `sklearn KMeans`\n",
+    "\n",
+    "- import statement:\n",
+    "```python\n",
+    "from sklearn.cluster import KMeans\n",
+    "```\n",
+    "- documentation: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html\n",
+    "\n",
+    "**Instantiation:**\n",
+    "`KMeans(n_clusters=<num>, n_init=<num>, max_iter=<num>)`\n",
+    "- `n_clusters`: number of clusters to be formed\n",
+    "- `n_init`: number of initial random seeds to try (to avoid downside of bad initial random choices)\n",
+    "- `max_iter`: maximum number of iterations for a single K-means run (single starting seed)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "caa96a1e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "km_cluster = \n",
+    "km_cluster"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea51243c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "84e59c4a",
+   "metadata": {},
+   "source": [
+    "**Methods:**\n",
+    "1. `fit`: find good centroids\n",
+    "2. `transform`: give me the distances from each point to each centroid\n",
+    "3. `predict`: give me the chosen group labels\n",
+    "\n",
+    "**Attributes:**\n",
+    "- `<km object>.cluster_centers_`: coordinates of cluster centers\n",
+    "- `<km object>.inertia_`: sum of squared distances of samples to their closest cluster center"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "26be1744",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# `fit`: find good centroids\n",
+    "km_cluster.fit(df)\n",
+    "# coordinates of cluster centers\n",
+    "km_cluster.cluster_centers_"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6ce05e61",
+   "metadata": {},
+   "source": [
+    "**Observation:** 3 rows (because we have 3 clusters), and 2 columns (because the df had 2 columns)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2df977a4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# `transform`: give me the distances from each point to each centroid\n",
+    "km_cluster.transform(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7cd8409e",
+   "metadata": {},
+   "source": [
+    "**Observations**: Each row corresponds to a row in df. 3 columns correspond to 3 distances to the centroids."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6a65a976",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# `predict`: give me the chosen group labels\n",
+    "km_cluster.predict(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "240d995a",
+   "metadata": {},
+   "source": [
+    "### How many clusters do we need?\n",
+    "\n",
+    "- metric: `<km object>.inertia_`: sum of squared distances of samples to their closest cluster center"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8bf73d2c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "km_cluster.inertia_"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "57b5ccc4",
+   "metadata": {},
+   "source": [
+    "**Observation**: we want \"inertia\" to be as small as possible."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ae70b416",
+   "metadata": {},
+   "source": [
+    "### Elbow plot to determine `n_clusters`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "607a96b0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create a series indexed by number of clusters (1 to 10) whose values are the corresponding inertia\n",
+    "s = pd.Series(dtype=float)\n",
+    "\n",
+    "\n",
+    "\n",
+    "s"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "388cd23f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ax = s.plot.line(figsize=(6, 4))\n",
+    "ax.set_ylabel(\"Inertia\")\n",
+    "ax.set_xlabel(\"Number of clusters\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eab497cd",
+   "metadata": {},
+   "source": [
+    "**Observation**: there is an \"elbow\" around `n_clusters`=3."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8e763d1c",
+   "metadata": {},
+   "source": [
+    "#### Will we always have a clear \"elbow\"?\n",
+    "\n",
+    "- Let's generate uniform random data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b5ad30ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2 = pd.DataFrame(np.random.uniform(0, 10, (100, 2)))\n",
+    "\n",
+    "s = pd.Series(dtype=float)\n",
+    "\n",
+    "for num_clusters in range(1, 11):\n",
+    "    km = KMeans(num_clusters, n_init = 320)\n",
+    "    km.fit(df2)\n",
+    "    s.at[num_clusters] = km.inertia_\n",
+    "\n",
+    "ax = s.plot.line(figsize=(6, 4))\n",
+    "ax.set_ylabel(\"Inertia\")\n",
+    "ax.set_xlabel(\"Number of clusters\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c6decdab-74a3-45b5-a408-e7b54bd992d9",
+   "metadata": {},
+   "source": [
+    "**Observation**: with uniformly random data there is no clear \"elbow\"; inertia keeps decreasing gradually as `n_clusters` increases."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3c6115b0-61df-4660-8355-3cd56bd94080",
+   "metadata": {},
+   "source": [
+    "#### Will we always have a clear \"elbow\"?\n",
+    "\n",
+    "- Let's generate uniform random data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "424743f8-41de-42b8-ab78-07901682ee84",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2 = pd.DataFrame(np.random.uniform(0, 10, (100, 2)))\n",
+    "df2.plot.scatter(0, 1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fba71303-d4c6-46d3-ae8e-f0e863d8e032",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "s = pd.Series(dtype=float)\n",
+    "\n",
+    "for num_clusters in range(1, 11):\n",
+    "    km = KMeans(num_clusters, n_init = 320)\n",
+    "    km.fit(df2)\n",
+    "    s.at[num_clusters] = km.inertia_\n",
+    "\n",
+    "ax = s.plot.line(figsize=(6, 4))\n",
+    "ax.set_ylabel(\"Inertia\")\n",
+    "ax.set_xlabel(\"Number of clusters\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8af299e7",
+   "metadata": {},
+   "source": [
+    "### K-Means use cases:\n",
+    "\n",
+    "1. estimator\n",
+    "2. transformer:\n",
+    "    - sometimes we'll use an unsupervised learning technique (like k-means) to pre-process data, creating better inputs for a supervised learning technique (like logistic regression)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6b99861d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def make_data():\n",
+    "    x, y = datasets.make_blobs(n_samples=250, centers=5, random_state=5)\n",
+    "    xcols = [\"x0\", \"x1\"]\n",
+    "    df1 = pd.DataFrame(x, columns=xcols)\n",
+    "    df1[\"y\"] = y > 0\n",
+    "\n",
+    "    df2 = pd.DataFrame(np.random.uniform(-10, 10, size=(250, 2)), columns=[\"x0\", \"x1\"])\n",
+    "    df2[\"y\"] = False\n",
+    "\n",
+    "    return pd.concat((df1, df2))\n",
+    "\n",
+    "train, test = train_test_split(make_data())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c1a0353f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.rcParams[\"font.size\"] = 16\n",
+    "fig, ax = plt.subplots(ncols=2, figsize=(10,4))\n",
+    "train.plot.scatter(x=\"x0\", y=\"x1\", c=train[\"y\"], vmin=-1, ax=ax[0])\n",
+    "test.plot.scatter(x=\"x0\", y=\"x1\", c=\"red\", ax=ax[1])\n",
+    "ax[0].set_title(\"Training Data\")\n",
+    "ax[1].set_title(\"Test Data\")\n",
+    "plt.subplots_adjust(wspace=0.4)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "57800660",
+   "metadata": {},
+   "source": [
+    "#### Objective: use `LogisticRegression` to classify points as \"black\" or \"gray\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cba5b0b6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = Pipeline([\n",
+    "    (\"km\", KMeans(10, n_init = 320)),\n",
+    "    (\"lr\", LogisticRegression()),\n",
+    "])\n",
+    "# TO DO: fit the model with train columns \"x0\", \"x1\" and train column y\n",
+    "\n",
+    "# TO DO: score the model with test columns \"x0\", \"x1\" and test column y\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e78a788c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = Pipeline([\n",
+    "    (\"km\", KMeans(10, n_init = 320)),\n",
+    "    (\"std\", StandardScaler()),\n",
+    "    (\"lr\", LogisticRegression()),\n",
+    "])\n",
+    "model.fit(train[[\"x0\", \"x1\"]], train[\"y\"])\n",
+    "model.score(test[[\"x0\", \"x1\"]], test[\"y\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0d506007",
+   "metadata": {},
+   "source": [
+    "### `StandardScaler` with `KMeans`\n",
+    "\n",
+    "Recall that `StandardScaler` should always be applied after applying `PolynomialFeatures` (from last lecture)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1229aad1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x = datasets.make_blobs(centers=np.array([(0, 0), (0, 20), (3, 20)]))[0]\n",
+    "df = pd.DataFrame(x)\n",
+    "df.plot.scatter(x=0, y=1, figsize=(6, 4))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9f21a66d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "km_c = KMeans(2, n_init = 320)\n",
+    "km_c.fit(df)\n",
+    "km_c.predict(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7dc700d4",
+   "metadata": {},
+   "source": [
+    "#### `fit_predict(...)` is a shortcut for `fit` and `predict` method invocations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7bc1d18d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "KMeans(2, n_init = 320).fit_predict(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b2dfa6bd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# -1 => white, 0 => gray, 1 => black\n",
+    "df.plot.scatter(x=0, y=1, figsize=(6, 4), c=KMeans(2, n_init = 320).fit_predict(df), vmin=-1, vmax=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "762f2882",
+   "metadata": {},
+   "source": [
+    "**Observation**: the scale (units) of the columns is intentionally not specified."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4f99dfeb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d444185c",
+   "metadata": {},
+   "source": [
+    "Let's make a copy of the data. Assuming initial data for both columns is in \"km\", let's convert one column (`0`) into \"meters\". "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2e437218",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2 = df.copy()\n",
+    "df2[0] *= 1000 # km => m\n",
+    "df2.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c99315a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2.plot.scatter(x=0, y=1, figsize=(6,4), c=KMeans(2, n_init = 320).fit_predict(df2), vmin=-1, vmax=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3966ddd3",
+   "metadata": {},
+   "source": [
+    "**Observations**:\n",
+    "- One would expect to see the same clusters, but that is not happening here. Why?\n",
+    "    - x-axis difference is too high when compared to the y-axis difference\n",
+    "    - That is, KMeans has no notion of units: column `0` was rescaled (km => m) but column `1` was not, so distances along the x-axis dominate\n",
+    "- This is not too far off from realistic datasets. \n",
+    "    - That is, real-world dataset columns might have different units. \n",
+    "    - For example, one column might represent temperature data whereas another might represent distance."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b63c9d9b",
+   "metadata": {},
+   "source": [
+    "#### Conclusion: `StandardScaler` should be applied before `KMeans`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2c81ed04",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TO DO: write a pipeline with StandardScaler and KMeans with 2 clusters\n",
+    "\n",
+    "\n",
+    "\n",
+    "df2.plot.scatter(x=0, y=1, figsize=(6, 4), c=model.fit_predict(df2), vmin=-1, vmax=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "359dd107",
+   "metadata": {},
+   "source": [
+    "### Wisconsin counties example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8847306e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = gpd.read_file(\"counties.geojson\")\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "377e9bb0",
+   "metadata": {},
+   "source": [
+    "#### If we want to use \"POP100\", \"AREALAND\", \"developed\", \"forest\", \"pasture\", \"crops\" for clustering, what transformer should we use? \n",
+    "\n",
+    "- StandardScaler."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "15f0b21c",
+   "metadata": {},
+   "source": [
+    "### Goal here: cluster counties based on similar land usage."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "55013d0a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.plot()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a199a2af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.plot(column=\"crops\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8735a7bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.plot(column=\"forest\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a42394cb",
+   "metadata": {},
+   "source": [
+    "### KMeans"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a8b3c831",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "xcols = [\"developed\", \"forest\", \"pasture\", \"crops\"]\n",
+    "\n",
+    "# instantiate\n",
+    "km_c = KMeans(4, n_init = 320)\n",
+    "# fit\n",
+    "km_c.fit(df[xcols])\n",
+    "# predict\n",
+    "clusters = km_c.predict(df[xcols])\n",
+    "\n",
+    "print(km_c.inertia_)\n",
+    "print(clusters)\n",
+    "\n",
+    "df.plot(column=clusters, cmap=\"tab10\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "01189d61",
+   "metadata": {},
+   "source": [
+    "**Observation**: cluster numbers are arbitrary. That is, if you re-run the above cell, you may get a different number for each cluster."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1764b5a6",
+   "metadata": {},
+   "source": [
+    "### Agglomerative clustering\n",
+    "\n",
+    "- import statement\n",
+    "```python\n",
+    "from sklearn.cluster import AgglomerativeClustering\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3d7954a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "xcols = [\"developed\", \"forest\", \"pasture\", \"crops\"]\n",
+    "\n",
+    "# instantiate\n",
+    "km_c = AgglomerativeClustering(4)\n",
+    "# fit\n",
+    "km_c.fit(df[xcols])\n",
+    "# predict\n",
+    "clusters = km_c.predict(df[xcols])\n",
+    "\n",
+    "print(km_c.inertia_)\n",
+    "print(clusters)\n",
+    "\n",
+    "df.plot(column=clusters, cmap=\"tab10\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6f825891",
+   "metadata": {},
+   "source": [
+    "**Observations**: \n",
+    "- no centroids => no inertia => no elbow plots (how do we pick cluster count?):\n",
+    "    - AttributeError: 'AgglomerativeClustering' object has no attribute 'inertia_'\n",
+    "- no `predict` method, but there is `fit_predict`:\n",
+    "    - AttributeError: 'AgglomerativeClustering' object has no attribute 'predict'\n",
+    "    - why?\n",
+    "        - because each point could lead to a completely different tree\n",
+    "        - remember unlike KMeans (which is top-down), AgglomerativeClustering is bottom-up"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e16c6131",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "xcols = [\"developed\", \"forest\", \"pasture\", \"crops\"]\n",
+    "\n",
+    "# instantiate\n",
+    "km_c = AgglomerativeClustering(4)\n",
+    "# fit_predict\n",
+    "clusters = km_c.fit_predict(df[xcols])\n",
+    "\n",
+    "# print(km_c.inertia_)\n",
+    "print(clusters)\n",
+    "\n",
+    "df.plot(column=clusters, cmap=\"tab10\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5fc2bf08-a0e3-49a3-b089-af0f99f83613",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/lecture_material/24-clustering/24-clustering_002.ipynb b/lecture_material/24-clustering/24-clustering_002.ipynb
new file mode 100644
index 0000000..1550e87
--- /dev/null
+++ b/lecture_material/24-clustering/24-clustering_002.ipynb
@@ -0,0 +1,916 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "035e9e2c-9781-4b9c-8395-be9e55e4e082",
+   "metadata": {},
+   "source": [
+    "# Clustering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cbd48a28",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import geopandas as gpd\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "from sklearn import datasets\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.linear_model import LinearRegression, LogisticRegression\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.preprocessing import PolynomialFeatures\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "\n",
+    "# new import statements\n",
+    "from sklearn.cluster import KMeans, AgglomerativeClustering"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e72ea2f4",
+   "metadata": {},
+   "source": [
+    "# Unsupervised Machine Learning: Clustering\n",
+    "\n",
+    "- In classification (supervised), we try to find boundaries/rules to separate points according to pre-determined labels.\n",
+    "- In clustering, the algorithm chooses the labels.  Goal is to choose labels so that similar rows get labeled the same.\n",
+    "\n",
+    "### K-Means Clustering\n",
+    "\n",
+    "- K: number of clusters:\n",
+    "    - 3-Means => 3 clusters\n",
+    "    - 4-Means => 4 clusters, and so on\n",
+    "- Means: we will find centroids (aka means aka averages) to create clusters\n",
+    "\n",
+    "- import statement:\n",
+    "```python\n",
+    "from sklearn.cluster import KMeans\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3a0ad5a5",
+   "metadata": {},
+   "source": [
+    "#### Iterative algorithm for K-Means"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0b83aaf3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Generate random data\n",
+    "x, y = datasets.make_blobs(n_samples=100, centers=3, cluster_std=1.2, random_state=3)\n",
+    "df = pd.DataFrame(x, columns=[\"x0\", \"x1\"])\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fbced908",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def km_scatter(df, **kwargs):\n",
+    "    \"\"\"\n",
+    "    Produces scatter plot visualizations with x0 on x-axis and x1 on y-axis.\n",
+    "    It can also plot the centroids for clusters.\n",
+    "    Parameters:\n",
+    "        x0 => x-axis\n",
+    "        x1 => y-axis\n",
+    "        label => marker type\n",
+    "    \"\"\"\n",
+    "    ax = kwargs.pop(\"ax\", None)\n",
+    "    if not \"label\" in df.columns:\n",
+    "        return df.plot.scatter(x=\"x0\", y=\"x1\", marker=\"$?$\", ax=ax, **kwargs)\n",
+    "\n",
+    "    for marker in set(df[\"label\"]):\n",
+    "        sub_df = df[df[\"label\"] == marker]\n",
+    "        ax = sub_df.plot.scatter(x=\"x0\", y=\"x1\", marker=marker, ax=ax, **kwargs)\n",
+    "    return ax\n",
+    "\n",
+    "ax = km_scatter(df, s=100, c=\"0.7\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "47686eee",
+   "metadata": {},
+   "source": [
+    "### Hard Problem\n",
+    "\n",
+    "Finding the best answer. What is the answer? Determining the centroids of the clusters.\n",
+    "\n",
+    "### Easier Problem\n",
+    "\n",
+    "Taking a random answer and make it a little better. Then repeat!\n",
+    "Downside? If randomization leads to very bad initial choice of centroids, that might lead to bad clustering (fewer clusters)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6f8bde9e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "clusters = np.random.uniform(-5, 5, size=(3, 2))\n",
+    "clusters = pd.DataFrame(clusters, columns=[\"x0\", \"x1\"])\n",
+    "clusters[\"label\"] = [\"o\", \"+\", \"x\"]\n",
+    "\n",
+    "ax = km_scatter(df, s=100, c=\"0.7\")\n",
+    "km_scatter(clusters, s=200, c=\"red\", ax=ax)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a3fe986c",
+   "metadata": {},
+   "source": [
+    "Two variables for us to deal with:\n",
+    "1. clusters: contains location of centroids and a label for them\n",
+    "2. df: contains the actual data points"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cfa1f1aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "clusters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f210c534",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a28466ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class KM:\n",
+    "    def __init__(self, df, clusters):\n",
+    "        # We make copies because we are going to keep changing the dataframe to \n",
+    "        # identify better clusters\n",
+    "        self.df = df.copy()\n",
+    "        self.clusters = clusters.copy()\n",
+    "        self.labels = clusters[\"label\"].values\n",
+    "        \n",
+    "    def plot(self):\n",
+    "        ax = km_scatter(self.df, color=\"0.7\", s=100)\n",
+    "        km_scatter(self.clusters, ax=ax, color=\"red\", s=200)\n",
+    "        \n",
+    "    def assign_points(self):\n",
+    "        \"\"\"\n",
+    "        compute Euclidean distance between each point and each centroid\n",
+    "        \"\"\"\n",
+    "        pass\n",
+    "    \n",
+    "    def update_centers(self):\n",
+    "        \"\"\"\n",
+    "        update centroids by taking mean of the points that are nearest to that\n",
+    "        particular centroid\n",
+    "        \"\"\"\n",
+    "        pass\n",
+    "\n",
+    "\"\"\"\n",
+    "High-level algorithm:\n",
+    "1. Start with random locations for centroids\n",
+    "2. Iterate over each data point:\n",
+    "    1. Find the distance (Euclidean distance) between current data point and each centroid.\n",
+    "    2. Find the minimum of those distances and the corresponding label.\n",
+    "    3. Assign current data point to the closest cluster centroid label.\n",
+    "4. Once all points are assigned, compute new centroid for each cluster. Iterate over \n",
+    "   each cluster:\n",
+    "    1. Extract subset of data points which got assigned to curr cluster label.\n",
+    "    2. Compute mean of all the assigned data points.\n",
+    "    3. Update cluster centroid.\n",
+    "5. Repeat steps 2 to 4 many times (iterative improvement).\n",
+    "\"\"\"\n",
+    "\n",
+    "# Creating object instance\n",
+    "km = KM(df, clusters)\n",
+    "km.plot()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "938a6cc5",
+   "metadata": {},
+   "source": [
+    "### `sklearn KMeans`\n",
+    "\n",
+    "- import statement:\n",
+    "```python\n",
+    "from sklearn.cluster import KMeans\n",
+    "```\n",
+    "- documentation: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html\n",
+    "\n",
+    "**Instantiation:**\n",
+    "`KMeans(n_clusters=<num>, n_init=<num>, max_iter=<num>)`\n",
+    "- `n_clusters`: number of clusters to be formed\n",
+    "- `n_init`: number of initial random seeds to try (to avoid downside of bad initial random choices)\n",
+    "- `max_iter`: maximum number of iterations for a single K-means run (single starting seed)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "caa96a1e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "km_cluster = \n",
+    "km_cluster"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea51243c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "84e59c4a",
+   "metadata": {},
+   "source": [
+    "**Methods:**\n",
+    "1. `fit`: find good centroids\n",
+    "2. `transform`: give me the distances from each point to each centroid\n",
+    "3. `predict`: give me the chosen group labels\n",
+    "\n",
+    "**Attributes:**\n",
+    "- `<km object>.cluster_centers_`: coordinates of cluster centers\n",
+    "- `<km object>.inertia_`: sum of squared distances of samples to their closest cluster center"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "26be1744",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# `fit`: find good centroids\n",
+    "km_cluster.fit(df)\n",
+    "# coordinates of cluster centers\n",
+    "km_cluster.cluster_centers_"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6ce05e61",
+   "metadata": {},
+   "source": [
+    "**Observation:** 3 rows (because we have 3 clusters), and 2 columns (because the df had 2 columns)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2df977a4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# `transform`: give me the distances from each point to each centroid\n",
+    "km_cluster.transform(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7cd8409e",
+   "metadata": {},
+   "source": [
+    "**Observations**: Each row corresponds to a row in df. 3 columns correspond to 3 distances to the centroids."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6a65a976",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# `predict`: give me the chosen group labels\n",
+    "km_cluster.predict(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "240d995a",
+   "metadata": {},
+   "source": [
+    "### How many clusters do we need?\n",
+    "\n",
+    "- metric: `<km object>.inertia_`: sum of squared distances of samples to their closest cluster center"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8bf73d2c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "km_cluster.inertia_"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "57b5ccc4",
+   "metadata": {},
+   "source": [
+    "**Observation**: we want \"inertia\" to be as small as possible."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ae70b416",
+   "metadata": {},
+   "source": [
+    "### Elbow plot to determine `n_clusters`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "607a96b0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create a Series indexed by number of clusters (1 to 10) whose values are the corresponding inertia\n",
+    "s = pd.Series(dtype=float)\n",
+    "\n",
+    "\n",
+    "\n",
+    "s"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "388cd23f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ax = s.plot.line(figsize=(6, 4))\n",
+    "ax.set_ylabel(\"Inertia\")\n",
+    "ax.set_xlabel(\"Number of clusters\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eab497cd",
+   "metadata": {},
+   "source": [
+    "**Observation**: there is an \"elbow\" around `n_clusters`=3."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8e763d1c",
+   "metadata": {},
+   "source": [
+    "#### Will we always have a clear \"elbow\"?\n",
+    "\n",
+    "- Let's generate uniform random data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b5ad30ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2 = pd.DataFrame(np.random.uniform(0, 10, (100, 2)))\n",
+    "\n",
+    "s = pd.Series(dtype=float)\n",
+    "\n",
+    "for num_clusters in range(1, 11):\n",
+    "    km = KMeans(num_clusters, n_init = 320)\n",
+    "    km.fit(df2)\n",
+    "    s.at[num_clusters] = km.inertia_\n",
+    "\n",
+    "ax = s.plot.line(figsize=(6, 4))\n",
+    "ax.set_ylabel(\"Inertia\")\n",
+    "ax.set_xlabel(\"Number of clusters\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c6decdab-74a3-45b5-a408-e7b54bd992d9",
+   "metadata": {},
+   "source": [
+    "**Observation**: for uniform random data, inertia decreases gradually and there is no clear \"elbow\"."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3c6115b0-61df-4660-8355-3cd56bd94080",
+   "metadata": {},
+   "source": [
+    "#### Will we always have a clear \"elbow\"?\n",
+    "\n",
+    "- Let's generate uniform random data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "424743f8-41de-42b8-ab78-07901682ee84",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2 = pd.DataFrame(np.random.uniform(0, 10, (100, 2)))\n",
+    "df2.plot.scatter(0, 1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fba71303-d4c6-46d3-ae8e-f0e863d8e032",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "s = pd.Series(dtype=float)\n",
+    "\n",
+    "for num_clusters in range(1, 11):\n",
+    "    km = KMeans(num_clusters, n_init = 320)\n",
+    "    km.fit(df2)\n",
+    "    s.at[num_clusters] = km.inertia_\n",
+    "\n",
+    "ax = s.plot.line(figsize=(6, 4))\n",
+    "ax.set_ylabel(\"Inertia\")\n",
+    "ax.set_xlabel(\"Number of clusters\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8af299e7",
+   "metadata": {},
+   "source": [
+    "### K-Means use cases:\n",
+    "\n",
+    "1. estimator\n",
+    "2. transformer:\n",
+    "    - sometimes we'll use an unsupervised learning technique (like k-means) to pre-process data, creating better inputs for a supervised learning technique (like logistic regression)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6b99861d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def make_data():\n",
+    "    x, y = datasets.make_blobs(n_samples=250, centers=5, random_state=5)\n",
+    "    xcols = [\"x0\", \"x1\"]\n",
+    "    df1 = pd.DataFrame(x, columns=xcols)\n",
+    "    df1[\"y\"] = y > 0\n",
+    "\n",
+    "    df2 = pd.DataFrame(np.random.uniform(-10, 10, size=(250, 2)), columns=[\"x0\", \"x1\"])\n",
+    "    df2[\"y\"] = False\n",
+    "\n",
+    "    return pd.concat((df1, df2))\n",
+    "\n",
+    "train, test = train_test_split(make_data())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c1a0353f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.rcParams[\"font.size\"] = 16\n",
+    "fig, ax = plt.subplots(ncols=2, figsize=(10,4))\n",
+    "train.plot.scatter(x=\"x0\", y=\"x1\", c=train[\"y\"], vmin=-1, ax=ax[0])\n",
+    "test.plot.scatter(x=\"x0\", y=\"x1\", c=\"red\", ax=ax[1])\n",
+    "ax[0].set_title(\"Training Data\")\n",
+    "ax[1].set_title(\"Test Data\")\n",
+    "plt.subplots_adjust(wspace=0.4)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "57800660",
+   "metadata": {},
+   "source": [
+    "#### Objective: use `LogisticRegression` to classify points as \"black\" or \"gray\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cba5b0b6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = Pipeline([\n",
+    "    (\"km\", KMeans(10, n_init = 320)),\n",
+    "    (\"lr\", LogisticRegression()),\n",
+    "])\n",
+    "# TO DO: fit the model with train columns \"x0\", \"x1\" and train column \"y\"\n",
+    "\n",
+    "# TO DO: score the model with test columns \"x0\", \"x1\" and test column y\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e78a788c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = Pipeline([\n",
+    "    (\"km\", KMeans(10, n_init = 320)),\n",
+    "    (\"std\", StandardScaler()),\n",
+    "    (\"lr\", LogisticRegression()),\n",
+    "])\n",
+    "model.fit(train[[\"x0\", \"x1\"]], train[\"y\"])\n",
+    "model.score(test[[\"x0\", \"x1\"]], test[\"y\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0d506007",
+   "metadata": {},
+   "source": [
+    "### `StandardScaler` with `KMeans`\n",
+    "\n",
+    "Recall that `StandardScaler` should always be applied after applying `PolynomialFeatures` (from last lecture)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1229aad1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x = datasets.make_blobs(centers=np.array([(0, 0), (0, 20), (3, 20)]))[0]\n",
+    "df = pd.DataFrame(x)\n",
+    "df.plot.scatter(x=0, y=1, figsize=(6, 4))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9f21a66d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "km_c = KMeans(2, n_init = 320)\n",
+    "km_c.fit(df)\n",
+    "km_c.predict(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7dc700d4",
+   "metadata": {},
+   "source": [
+    "#### `fit_predict(...)` is a shortcut for `fit` and `predict` method invocations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7bc1d18d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "KMeans(2, n_init = 320).fit_predict(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b2dfa6bd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# -1 => white, 0 => gray, 1 => black\n",
+    "df.plot.scatter(x=0, y=1, figsize=(6, 4), c=KMeans(2, n_init = 320).fit_predict(df), vmin=-1, vmax=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "762f2882",
+   "metadata": {},
+   "source": [
+    "**Observation**: the scales (units) of the columns are intentionally not specified."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4f99dfeb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d444185c",
+   "metadata": {},
+   "source": [
+    "Let's make a copy of the data. Assuming initial data for both columns is in \"km\", let's convert one column (`0`) into \"meters\". "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2e437218",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2 = df.copy()\n",
+    "df2[0] *= 1000 # km => m\n",
+    "df2.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c99315a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2.plot.scatter(x=0, y=1, figsize=(6,4), c=KMeans(2, n_init = 320).fit_predict(df2), vmin=-1, vmax=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3966ddd3",
+   "metadata": {},
+   "source": [
+    "**Observations**:\n",
+    "- One would expect to see the same clusters, but that is not happening here. Why?\n",
+    "    - after rescaling, differences along the x-axis are much larger than differences along the y-axis, so they dominate the Euclidean distance\n",
+    "    - That is, KMeans has no way of knowing that the x-axis is in different units than the y-axis\n",
+    "- This is not too far off from realistic datasets. \n",
+    "    - That is, real-world dataset columns might have different units. \n",
+    "    - For example, one column might represent temperature data whereas another might represent distance."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b63c9d9b",
+   "metadata": {},
+   "source": [
+    "#### Conclusion: `StandardScaler` should be applied before `KMeans`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2c81ed04",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TO DO: write a pipeline with StandardScaler and KMeans with 2 clusters\n",
+    "\n",
+    "\n",
+    "\n",
+    "df2.plot.scatter(x=0, y=1, figsize=(6, 4), c=model.fit_predict(df2), vmin=-1, vmax=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "359dd107",
+   "metadata": {},
+   "source": [
+    "### Wisconsin counties example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8847306e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = gpd.read_file(\"counties.geojson\")\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "377e9bb0",
+   "metadata": {},
+   "source": [
+    "#### If we want to use \"POP100\", \"AREALAND\", \"developed\", \"forest\", \"pasture\", \"crops\" for clustering, what transformer should we use? \n",
+    "\n",
+    "- StandardScaler."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "15f0b21c",
+   "metadata": {},
+   "source": [
+    "### Goal here: cluster counties based on similar land usage."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "55013d0a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.plot()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a199a2af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.plot(column=\"crops\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8735a7bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.plot(column=\"forest\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a42394cb",
+   "metadata": {},
+   "source": [
+    "### KMeans"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a8b3c831",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "xcols = [\"developed\", \"forest\", \"pasture\", \"crops\"]\n",
+    "\n",
+    "# instantiate\n",
+    "km_c = KMeans(4, n_init = 320)\n",
+    "# fit\n",
+    "km_c.fit(df[xcols])\n",
+    "# predict\n",
+    "clusters = km_c.predict(df[xcols])\n",
+    "\n",
+    "print(km_c.inertia_)\n",
+    "print(clusters)\n",
+    "\n",
+    "df.plot(column=clusters, cmap=\"tab10\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "01189d61",
+   "metadata": {},
+   "source": [
+    "**Observation**: cluster numbers (labels) are arbitrary. That is, if you re-run the above cell, each cluster may be assigned a different number."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1764b5a6",
+   "metadata": {},
+   "source": [
+    "### Agglomerative clustering\n",
+    "\n",
+    "- import statement\n",
+    "```python\n",
+    "from sklearn.cluster import AgglomerativeClustering\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3d7954a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "xcols = [\"developed\", \"forest\", \"pasture\", \"crops\"]\n",
+    "\n",
+    "# instantiate\n",
+    "km_c = AgglomerativeClustering(4)\n",
+    "# fit\n",
+    "km_c.fit(df[xcols])\n",
+    "# predict\n",
+    "clusters = km_c.predict(df[xcols])\n",
+    "\n",
+    "print(km_c.inertia_)\n",
+    "print(clusters)\n",
+    "\n",
+    "df.plot(column=clusters, cmap=\"tab10\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6f825891",
+   "metadata": {},
+   "source": [
+    "**Observations**: \n",
+    "- no centroids => no inertia => no elbow plots (how do we pick cluster count?):\n",
+    "    - AttributeError: 'AgglomerativeClustering' object has no attribute 'inertia_'\n",
+    "- no `predict` method, but there is `fit_predict`:\n",
+    "    - AttributeError: 'AgglomerativeClustering' object has no attribute 'predict'\n",
+    "    - why?\n",
+    "        - because each point could lead to a completely different tree\n",
+    "        - remember unlike KMeans (which is top-down), AgglomerativeClustering is bottom-up"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e16c6131",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "xcols = [\"developed\", \"forest\", \"pasture\", \"crops\"]\n",
+    "\n",
+    "# instantiate\n",
+    "km_c = AgglomerativeClustering(4)\n",
+    "# fit_predict\n",
+    "clusters = km_c.fit_predict(df[xcols])\n",
+    "\n",
+    "# print(km_c.inertia_)\n",
+    "print(clusters)\n",
+    "\n",
+    "df.plot(column=clusters, cmap=\"tab10\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5fc2bf08-a0e3-49a3-b089-af0f99f83613",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab