From 564b910358ccc8f506abf33e98383445a15b53d2 Mon Sep 17 00:00:00 2001
From: TYLER CARAZA-HARTER <tharter@cs544-tharter.cs.wisc.edu>
Date: Mon, 3 Mar 2025 20:48:44 -0600
Subject: [PATCH] part 3

---
 lec/15-sql/lec1.ipynb | 487 ++++++++++++++++++++++++++++++++++++------
 p4/README.md          |  48 ++---
 p4/docker-compose.yml |   4 +-
 p4/lender.proto       |   9 +-
 4 files changed, 449 insertions(+), 99 deletions(-)

diff --git a/lec/15-sql/lec1.ipynb b/lec/15-sql/lec1.ipynb
index 6c084b4..905d1aa 100644
--- a/lec/15-sql/lec1.ipynb
+++ b/lec/15-sql/lec1.ipynb
@@ -14,7 +14,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "id": "8cabbb35-3d75-44ee-886c-6aa871a01d68",
    "metadata": {},
    "outputs": [
@@ -24,7 +24,7 @@
        "[]"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -37,17 +37,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "id": "b68ba375-c9c9-4677-9d9d-310d1927a276",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<sqlalchemy.engine.cursor.CursorResult at 0x765a7c3ddb00>"
+       "<sqlalchemy.engine.cursor.CursorResult at 0x7c45d0474de0>"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -69,17 +69,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 4,
    "id": "00a5b5b9-8e91-4d90-99ad-e85a1756ea88",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<sqlalchemy.engine.cursor.CursorResult at 0x765a714c6820>"
+       "<sqlalchemy.engine.cursor.CursorResult at 0x7c45c9711e80>"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -97,7 +97,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 5,
    "id": "9739d58c-8004-4844-a609-7ed95bdbf9aa",
    "metadata": {},
    "outputs": [
@@ -107,7 +107,7 @@
        "[('accounts',), ('users',)]"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -120,17 +120,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 6,
    "id": "1fc2171c-9f09-4b40-b5e0-fcb84870ec7d",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<sqlalchemy.engine.cursor.CursorResult at 0x765a714c6f90>"
+       "<sqlalchemy.engine.cursor.CursorResult at 0x7c45c97122e0>"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -143,7 +143,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 7,
    "id": "a96a9978-0927-4886-bd72-225150f9a5a2",
    "metadata": {},
    "outputs": [],
@@ -155,7 +155,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 8,
    "id": "0dbc816b-4f66-4b19-bcb6-e1b5212ef469",
    "metadata": {},
    "outputs": [
@@ -165,7 +165,7 @@
        "[(1, 'tyler', None)]"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -179,7 +179,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 9,
    "id": "45c24702-0285-43fe-ae0e-b9d01adc2a37",
    "metadata": {},
    "outputs": [],
@@ -189,17 +189,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 10,
    "id": "2a18a93d-20c1-4f02-875a-e6154ad7aef5",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<sqlalchemy.engine.cursor.CursorResult at 0x765a842ece50>"
+       "<sqlalchemy.engine.cursor.CursorResult at 0x7c45c9712d60>"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -213,7 +213,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 11,
    "id": "753e807c-51d8-4190-9e8b-92e95bc8030b",
    "metadata": {},
    "outputs": [],
@@ -223,7 +223,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 12,
    "id": "ca9c9379-ba53-42f0-9e11-eed9921adc01",
    "metadata": {},
    "outputs": [],
@@ -238,7 +238,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 13,
    "id": "75ae1cff-684a-4c04-9ce3-f619edea898c",
    "metadata": {},
    "outputs": [],
@@ -248,7 +248,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 14,
    "id": "de914434-5eb7-465b-aec4-8bec6953b623",
    "metadata": {},
    "outputs": [],
@@ -265,7 +265,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 15,
    "id": "4c24c47e-8f03-4b84-9647-6fd1559a4b0b",
    "metadata": {},
    "outputs": [],
@@ -283,7 +283,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 16,
    "id": "a0f419e7-b0d7-4bf1-94ad-4c4ca961ada6",
    "metadata": {},
    "outputs": [
@@ -293,7 +293,7 @@
        "447367"
       ]
      },
-     "execution_count": 26,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -305,7 +305,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 17,
    "id": "96846d26-3aa8-414a-a925-eafa9fe60f50",
    "metadata": {},
    "outputs": [],
@@ -323,7 +323,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 18,
    "id": "791ad3aa-0a41-4c5c-b1ee-6d9b5b95e023",
    "metadata": {},
    "outputs": [
@@ -331,8 +331,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "remaining: -2\n",
-      "rollback!\n"
+      "remaining: 5\n",
+      "commit!\n"
      ]
     }
    ],
@@ -359,7 +359,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
    "id": "10f7d983-69fc-4800-b529-b7f19e0cdb73",
    "metadata": {},
    "outputs": [],
@@ -377,7 +377,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 20,
    "id": "ed5f92fc-ac76-4307-bd0c-6b714ed5a699",
    "metadata": {},
    "outputs": [
@@ -463,7 +463,7 @@
        "7   8      Preapproval request approved but not accepted"
       ]
      },
-     "execution_count": 38,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -478,7 +478,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 21,
    "id": "8349be60-02f4-43bb-9cf2-568bc70d2c75",
    "metadata": {},
    "outputs": [
@@ -643,7 +643,7 @@
        "9             1     NaN  "
       ]
      },
-     "execution_count": 39,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -659,7 +659,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 22,
    "id": "f99ed83c-d50c-43f7-bf3c-7307bb30801b",
    "metadata": {},
    "outputs": [],
@@ -669,7 +669,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 23,
    "id": "d2fe1388-b308-4f8c-8a63-c426c0f1b787",
    "metadata": {},
    "outputs": [],
@@ -679,7 +679,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 24,
    "id": "0da82d35-5e7f-481b-a3fe-9d828b541794",
    "metadata": {},
    "outputs": [
@@ -777,7 +777,7 @@
        "9        Exempt           405.0"
       ]
      },
-     "execution_count": 42,
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -793,7 +793,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 25,
    "id": "4ee4b0cb-7671-4570-9e18-fcd4b8c0394c",
    "metadata": {},
    "outputs": [
@@ -886,7 +886,7 @@
        "3             1   7291000.0  "
       ]
      },
-     "execution_count": 43,
+     "execution_count": 25,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -902,7 +902,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
+   "execution_count": 26,
    "id": "d64a2a5e-e2d4-42a3-a894-9283160d2636",
    "metadata": {},
    "outputs": [
@@ -1007,7 +1007,7 @@
        "4             2   None  "
       ]
      },
-     "execution_count": 48,
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1024,7 +1024,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 57,
+   "execution_count": 27,
    "id": "c01b796c-ded3-4e11-81e7-69d7660da9cb",
    "metadata": {},
    "outputs": [
@@ -1117,7 +1117,7 @@
        "4   63735000.0          2.99  "
       ]
      },
-     "execution_count": 57,
+     "execution_count": 27,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1136,7 +1136,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": 34,
    "id": "b7d4a687-70bf-46e7-a715-013977358d05",
    "metadata": {},
    "outputs": [
@@ -1197,7 +1197,7 @@
        "0   3  Refinancing  "
       ]
      },
-     "execution_count": 62,
+     "execution_count": 34,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1214,52 +1214,413 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 35,
    "id": "fc73517c-cf57-4a91-bb9d-2fbfc76544d5",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>COUNT(*)</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>447367</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   COUNT(*)\n",
+       "0    447367"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# how many rows are in the table?  Practice COUNT(*)."
+    "# how many rows are in the table?  Practice COUNT(*).\n",
+    "pd.read_sql(\"\"\"\n",
+    "SELECT COUNT(*)\n",
+    "FROM loans\n",
+    "\"\"\", conn)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 37,
    "id": "e91feeee-8689-4f57-a991-f6ca3fee2a6d",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>COUNT(income)</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>399948</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   COUNT(income)\n",
+       "0         399948"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# how many non-null values are in the income column?  Practice COUNT(column).\n",
+    "pd.read_sql(\"\"\"\n",
+    "SELECT COUNT(income)\n",
+    "FROM loans\n",
+    "\"\"\", conn)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "d688a1a3-3740-4dcf-8de5-b8f9f3e928b3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>loan_type</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>Conventional</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>FHA-insured</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>VA-guaranteed</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>FSA/RHS-guaranteed</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   id           loan_type\n",
+       "0   1        Conventional\n",
+       "1   2         FHA-insured\n",
+       "2   3       VA-guaranteed\n",
+       "3   4  FSA/RHS-guaranteed"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# how many non-null values are in the income column?  Practice COUNT(column)."
+    "pd.read_sql(\"\"\"\n",
+    "SELECT *\n",
+    "FROM loan_types\n",
+    "\"\"\", conn)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 49,
    "id": "12333532-0618-4ed4-b71b-260a4f35e581",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>AVG(interest_rate)</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2.21657</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   AVG(interest_rate)\n",
+       "0             2.21657"
+      ]
+     },
+     "execution_count": 49,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# what is the average interest rate for loans of type \"Conventional\"?  Practice AVG."
+    "# what is the average interest rate for loans of type \"Conventional\"?  Practice AVG.\n",
+    "pd.read_sql(\"\"\"\n",
+    "SELECT AVG(interest_rate)\n",
+    "FROM loans\n",
+    "INNER JOIN loan_types ON loans.loan_type = loan_types.id\n",
+    "WHERE loan_types.loan_type = 'Conventional'\n",
+    "\"\"\", conn)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 51,
    "id": "23c00af3-e385-435a-8bf6-f5cfe64f02db",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>loan_type</th>\n",
+       "      <th>AVG(interest_rate)</th>\n",
+       "      <th>COUNT(*)</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Conventional</td>\n",
+       "      <td>2.216570</td>\n",
+       "      <td>389217</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>VA-guaranteed</td>\n",
+       "      <td>1.919140</td>\n",
+       "      <td>24551</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>FHA-insured</td>\n",
+       "      <td>2.211670</td>\n",
+       "      <td>30496</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>FSA/RHS-guaranteed</td>\n",
+       "      <td>2.523942</td>\n",
+       "      <td>3103</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            loan_type  AVG(interest_rate)  COUNT(*)\n",
+       "0        Conventional            2.216570    389217\n",
+       "1       VA-guaranteed            1.919140     24551\n",
+       "2         FHA-insured            2.211670     30496\n",
+       "3  FSA/RHS-guaranteed            2.523942      3103"
+      ]
+     },
+     "execution_count": 51,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# how many loans are there of each type?  Practice GROUP BY."
+    "# how many loans are there of each type?  Practice GROUP BY.\n",
+    "pd.read_sql(\"\"\"\n",
+    "SELECT loan_types.loan_type, AVG(interest_rate), COUNT(*)\n",
+    "FROM loans\n",
+    "INNER JOIN loan_types ON loans.loan_type = loan_types.id\n",
+    "GROUP BY loan_types.loan_type\n",
+    "\"\"\", conn)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 53,
    "id": "2400a6a4-7056-47e0-b202-3bbb85a77b2f",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>loan_type</th>\n",
+       "      <th>AVG(interest_rate)</th>\n",
+       "      <th>count</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Conventional</td>\n",
+       "      <td>2.21657</td>\n",
+       "      <td>389217</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>VA-guaranteed</td>\n",
+       "      <td>1.91914</td>\n",
+       "      <td>24551</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>FHA-insured</td>\n",
+       "      <td>2.21167</td>\n",
+       "      <td>30496</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       loan_type  AVG(interest_rate)   count\n",
+       "0   Conventional             2.21657  389217\n",
+       "1  VA-guaranteed             1.91914   24551\n",
+       "2    FHA-insured             2.21167   30496"
+      ]
+     },
+     "execution_count": 53,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# which loan types appear at least 10,000 times?  Practice HAVING."
+    "# which loan types appear at least 10,000 times?  Practice HAVING.\n",
+    "pd.read_sql(\"\"\"\n",
+    "SELECT loan_types.loan_type, AVG(interest_rate), COUNT(*) as count\n",
+    "FROM loans\n",
+    "INNER JOIN loan_types ON loans.loan_type = loan_types.id\n",
+    "GROUP BY loan_types.loan_type\n",
+    "HAVING count >= 10000\n",
+    "\"\"\", conn)"
    ]
   }
  ],
diff --git a/p4/README.md b/p4/README.md
index 496ecc0..da4fef1 100644
--- a/p4/README.md
+++ b/p4/README.md
@@ -37,7 +37,7 @@ python3 client.py CalcAvgLoan -c <county_code>
 
 ### Docker Compose
 
-Take a look at the provided Docker compose file. There are several services, including `datanodes` with 3 replicas, a `namenode`, a `SQL server`, a `gRPC Server`. The NameNode service will serve at the host of `boss` within the docker compose network.
+Take a look at the provided Docker Compose file. There are several services, including 3 `datanodes`, a `namenode`, a `SQL server`, and a `gRPC server`. The NameNode service will be reachable at the hostname `boss` within the Docker Compose network.
 
 ### gRPC
 
@@ -63,25 +63,12 @@ variable.  You can set it to p4 in your environment:
 export PROJECT=p4
 ```
 
-
 **Hint 1:** The command `docker logs <container-name> -f` might be very useful for troubleshooting. It allows you to view real-time output from a specific container.
 
 **Hint 2:** Think about whether there is any .sh script that will help you quickly test code changes.  For example, you may want it to rebuild your Dockerfiles, cleanup an old Compose cluster, and deploy a new cluster.
 
 **Hint 3:** If you're low on disk space, consider running `docker system prune -a --volumes -f`
 
-<!--
-**Hint 3:** You might find it really helpful to use these command below to clean up the disk space occupied by Docker iamges/containers/networks/volumes artifacts. during the development of this project.
-
-```bash
-docker image prune -a  -f
-docker container prune  -f
-docker network prune  -f
-docker volume prune  -f
-docker system prune -a --volumes -f #Equivalent to the combination of all cmd above
-```
--->
-
 ## Part 1: `DbToHdfs` gRPC Call
 
 In this part, your task is to implement the `DbToHdfs` gRPC call (you can find the interface definition in the proto file).
@@ -106,11 +93,11 @@ In this part, your task is to implement the `DbToHdfs` gRPC call (you can find t
 2. What are the actual types for those loans? 
 Perform an inner join on these two tables so that a new column `loan_type_name` is added to the `loans` table, where its value is the corresponding `loan_type_name` from the `loan_types` table, based on the matching `loan_type_id` in `loans`. 
 3. Filter all rows where `loan_amount` is **greater than 30,000** and **less than 800,000**. After filtering, this table should have only **426716** rows.
-4. Upload the generated table to `/hdma-wi-2021.parquet` in the HDFS, with **3x** replication and a **1-MB** block size, using PyArrow (https://arrow.apache.org/docs/python/generated/pyarrow.fs.HadoopFileSystem.html).
+4. Upload the generated table to `/hdma-wi-2021.parquet` in the HDFS, with **2x** replication and a **1-MB** block size, using PyArrow (https://arrow.apache.org/docs/python/generated/pyarrow.fs.HadoopFileSystem.html).
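+
+A minimal sketch of the upload step is shown below. It assumes the joined and filtered result is already in a pandas DataFrame named `filtered_df` (an illustrative name) and that the NameNode is reachable at `boss:9000`; it is one way to meet the replication and block-size requirements, not the required implementation:
+
+```python
+import pyarrow as pa
+import pyarrow.parquet as pq
+from pyarrow import fs
+
+# set 2x replication and a 1-MB block size on the HDFS handle itself
+hdfs = fs.HadoopFileSystem("boss", 9000, replication=2,
+                           default_block_size=1024*1024)
+
+table = pa.Table.from_pandas(filtered_df)  # filtered_df: the joined/filtered rows
+with hdfs.open_output_stream("/hdma-wi-2021.parquet") as f:
+    pq.write_table(table, f)
+```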
 
 To check whether the upload was correct, you can use `docker exec -it` to enter the gRPC server's container and run the HDFS command `hdfs dfs -du -h <path>` to see the file size. The expected result is:
   ```
-  14.4 M  43.2 M  hdfs://boss:9000/hdma-wi-2021.parquet    
+  15.3 M  30.5 M  hdfs://nn:9000/hdma-wi-2021.parquet
   ```
 
 **Hint 1:** We used similar tables in lecture: https://git.doit.wisc.edu/cdis/cs/courses/cs544/s25/main/-/tree/main/lec/15-sql
@@ -130,42 +117,45 @@ In this part, your task is to implement the `BlockLocations` gRPC call (you can
 For example, running `docker exec -it p4-server-1 python3 /client.py BlockLocations -f /hdma-wi-2021.parquet` should show something like this:
 
 ```
-{'49a28b8287ad': 16, 'fe1d14755eed': 16, 'b0db22d30950': 16}
+{'7eb74ce67e75': 15, 'f7747b42d254': 6, '39750756065d': 11}
 ```
 
 Note: DataNode location is the randomly generated container ID for the
 container running the DataNode, so yours will be different, and the
-distribution of blocks across different nodes may also vary.
+distribution of blocks across different nodes will also likely vary.
 
 The documentation [here](https://hadoop.apache.org/docs/r3.3.6/hadoop-project-dist/hadoop-hdfs/WebHDFS.html) describes how to interact with HDFS via web requests. Many [examples](https://requests.readthedocs.io/en/latest/user/quickstart/) show these web requests being made with the `curl` command, but you'll adapt those examples to use `requests.get`. By default, WebHDFS runs on port 9870, so use port 9870 instead of 9000 to access HDFS for this part.
 
 Use a `GETFILEBLOCKLOCATIONS` operation to find the block locations.
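+
+A minimal sketch of such a request is below, assuming the NameNode's WebHDFS endpoint is reachable at `boss:9870`; the response layout follows the WebHDFS documentation linked above:
+
+```python
+import requests
+
+# ask WebHDFS where the blocks of the big Parquet file live
+resp = requests.get("http://boss:9870/webhdfs/v1/hdma-wi-2021.parquet",
+                    params={"op": "GETFILEBLOCKLOCATIONS"})
+resp.raise_for_status()
+blocks = resp.json()["BlockLocations"]["BlockLocation"]
+
+# tally blocks per DataNode ("hosts" lists the nodes storing each block)
+counts = {}
+for block in blocks:
+    for host in block["hosts"]:
+        counts[host] = counts.get(host, 0) + 1
+print(counts)
+```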
 
-## Part 3: `PartitionByCounty` and `CalcAvgLoan` gRPC Calls
+## Part 3: `CalcAvgLoan` gRPC Call
 
-In this part, your task is to implement the `PartitionByCounty` and `CalcAvgLoan` gRPC calls (you can find the interface definition in the proto file).
+In this part, your task is to implement the `CalcAvgLoan` gRPC call (you can find the interface definition in the proto file).
 
-Imagine a scenario where there could be many queries differentiated by `county`, and one of them is to get the average loan amount for a county. In this case, it might be much more efficient to generate a set of 1x Parquet files filtered by county, and then read data from these partitioned, relatively much smaller tables for computation.
+The call should read `hdma-wi-2021.parquet`, filtering to the rows with the specified county code.  One way to do this is to pass a `("column", "=", ????)` tuple inside a `filters` list when reading: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html
+
+The call should return the average loan amount from the filtered table.
+
+As an optimization, your code should also write the filtered data to a file named `partitioned/<county_code>.parquet`.  If there are later calls for the same `county_code`, your code should use the smaller, county-specific Parquet file (instead of filtering the big Parquet file with all loan applications).  The county-specific Parquet file should have 1x replication.  When `CalcAvgLoan` returns the average, it should also use the "source" field to indicate whether the data came from the big Parquet file (`source="create"`, because a new county-specific file had to be created) or from a previously created county-specific file (`source="reuse"`).
 
-**PartitionByCounty:** To be more specific, you need to categorize the contents of that parquet file just stored in HDFS using `county_id` as the key. For each `county_id`, create a new parquet file that records all entries under that county, and then save them with a **1x replication**. Files should be written into folder `/partitioned/` and name for each should be their `county_id`.
+One easy way to check whether the county-specific file already exists is to just try reading it with PyArrow.  You should get a `FileNotFoundError` exception if it doesn't exist.
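+
+Putting these pieces together, a rough sketch of the `CalcAvgLoan` logic might look like the following. The column name `county_code`, the helper signature, and the separate 1x-replication filesystem handle `hdfs_1x` are illustrative assumptions, not part of the provided code:
+
+```python
+import pyarrow.compute as pc
+import pyarrow.parquet as pq
+
+def calc_avg_loan(hdfs, hdfs_1x, county_code):
+    part_path = f"/partitioned/{county_code}.parquet"
+    try:
+        # fast path: the county-specific file already exists, so reuse it
+        table = pq.read_table(part_path, filesystem=hdfs)
+        source = "reuse"
+    except FileNotFoundError:
+        # slow path: filter the big file, then cache the result (1x replication)
+        table = pq.read_table("/hdma-wi-2021.parquet", filesystem=hdfs,
+                              filters=[("county_code", "=", county_code)])
+        with hdfs_1x.open_output_stream(part_path) as f:
+            pq.write_table(table, f)
+        source = "create"
+    avg = int(pc.mean(table["loan_amount"]).as_py())
+    return avg, source
+```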
+
+<!--
+Imagine a scenario where there could be many queries differentiated by `county`, and one of them is to get the average loan amount for a county. In this case, it might be much more efficient to generate a set of 1x Parquet files filtered by county, and then read data from these partitioned, relatively much smaller tables for computation.
 
 **CalcAvgLoan:** To be more specific, for a given `county_id` , you need to return a int value, indicating the average `loan_amount` of that county. **Note:** You are required to perform this calculation based on the partitioned parquet files generated by `FilterByCounty`. `source` field in proto file can ignored in this part.
+-->
 
-The inside of the partitioned directory should look like this:
+After a `DbToHdfs` call and a few `CalcAvgLoan` calls, your HDFS directory structure will look something like this:
 
       ```
+      ├── hdma-wi-2021.parquet
       ├── partitioned/
       │   ├── 55001.parquet
       │   ├── 55003.parquet
       │   └── ...
       ```
 
-The root directory on HDFS should now look like this:
-```
-14.4 M  43.2 M  hdfs://boss:9000/hdma-wi-2021.parquet
-19.3 M  19.3 M  hdfs://boss:9000/partitioned
-```
-
 ## Part 4: Fault Tolerance
 
 In this part, your task is to modify the `CalcAvgLoan` gRPC call you implemented in Part 3.
diff --git a/p4/docker-compose.yml b/p4/docker-compose.yml
index 194370e..18bf3a1 100644
--- a/p4/docker-compose.yml
+++ b/p4/docker-compose.yml
@@ -6,7 +6,7 @@ services:
                deploy:
                        resources:
                                limits:
-                                       memory: 2g
+                                       memory: 3g
 
         mysql:
                 image: ${PROJECT}-mysql
@@ -19,7 +19,7 @@ services:
                 deploy:
                         resources:
                                 limits:
-                                        memory: 2g
+                                        memory: 1g
         nn:
                 image: ${PROJECT}-nn
                 hostname: boss
diff --git a/p4/lender.proto b/p4/lender.proto
index b2cb2b9..d915e9b 100644
--- a/p4/lender.proto
+++ b/p4/lender.proto
@@ -17,11 +17,10 @@ message CalcAvgLoanReq {
 
 message CalcAvgLoanResp {
   int32 avg_loan = 1;
-  string source = 2; // partitioned or unpartitioned?
+  string source = 2; // create, reuse, or recreate
   string error = 3;
 }
 
-
 message StatusString{
   string status= 1;
 }
@@ -29,10 +28,10 @@ message StatusString{
 service Lender {
   //Load input.data from SQL server and upload it to HDFS
   rpc DbToHdfs (Empty) returns (StatusString);
+
   //Get the block locations of the Parquet file in HDFS
   rpc BlockLocations (BlockLocationsReq) returns (BlockLocationsResp);
-  //Classify the data in input.data based on county_code, and save each county_code as a separate Parquet file.
-  rpc PartitionByCounty (Empty) returns (StatusString);
+
   //Calculate the average loan amount for a given county_code
   rpc CalcAvgLoan (CalcAvgLoanReq) returns (CalcAvgLoanResp);
-}
\ No newline at end of file
+}
-- 
GitLab