From e0ce211852c1b61281a844ec99aebc481bb5bdef Mon Sep 17 00:00:00 2001
From: TYLER CARAZA-HARTER <tharter@cs544-tharter.cs.wisc.edu>
Date: Wed, 19 Mar 2025 14:22:04 -0500
Subject: [PATCH] wrapup ml demos

---
 lec/22-spark/nb/lec1.ipynb | 205 +++++++++++++++++++++++++------------
 lec/22-spark/nb/lec2.ipynb |  79 +++++++++++++-
 2 files changed, 216 insertions(+), 68 deletions(-)

diff --git a/lec/22-spark/nb/lec1.ipynb b/lec/22-spark/nb/lec1.ipynb
index 68a7739..6c42d14 100644
--- a/lec/22-spark/nb/lec1.ipynb
+++ b/lec/22-spark/nb/lec1.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 2,
    "id": "c8dca847-54af-4284-97d8-0682e88a6e8d",
    "metadata": {},
    "outputs": [
@@ -12,7 +12,7 @@
      "text": [
       "Setting default log level to \"WARN\".\n",
       "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
-      "25/03/17 14:23:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
+      "25/03/19 15:56:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
      ]
     }
    ],
@@ -28,10 +28,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "id": "b1664723-73de-4950-8c45-a33af0c07ee6",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found 3 items\n",
+      "drwxr-xr-x   - root supergroup          0 2025-03-17 14:38 hdfs://nn:9000/model\n",
+      "drwxr-xr-x   - root supergroup          0 2025-03-17 14:26 hdfs://nn:9000/test.parquet\n",
+      "drwxr-xr-x   - root supergroup          0 2025-03-17 14:26 hdfs://nn:9000/train.parquet\n"
+     ]
+    }
+   ],
    "source": [
     "!hdfs dfs -ls hdfs://nn:9000/"
    ]
@@ -3434,7 +3445,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 3,
    "id": "7dbe60b4-9a2b-4f9c-8e74-88eccaf2bcc6",
    "metadata": {},
    "outputs": [
@@ -3444,7 +3455,7 @@
        "DataFrame[x1: double, x2: double, y: double]"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3461,7 +3472,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 4,
    "id": "4c628bc4-072d-4988-bb21-f4c36ce22883",
    "metadata": {},
    "outputs": [],
@@ -3472,17 +3483,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 5,
    "id": "3e387f9f-fb9c-4c03-9ba1-c8345ed13126",
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                "
+     ]
+    },
     {
      "data": {
       "text/plain": [
        "(76, 24)"
       ]
      },
-     "execution_count": 24,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3493,18 +3511,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 6,
    "id": "4c65f3f3-b279-416f-b720-db6c29ab296e",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                                                                "
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "train.write.format(\"parquet\").mode(\"ignore\").save(\"hdfs://nn:9000/train.parquet\")\n",
     "test.write.format(\"parquet\").mode(\"ignore\").save(\"hdfs://nn:9000/test.parquet\")"
@@ -3512,10 +3522,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 7,
    "id": "2dcc55e4-81d8-4ad3-b7cc-f6830c09888b",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                "
+     ]
+    }
+   ],
    "source": [
     "train = spark.read.format(\"parquet\").load(\"hdfs://nn:9000/train.parquet\")\n",
     "test = spark.read.format(\"parquet\").load(\"hdfs://nn:9000/test.parquet\")"
@@ -3523,10 +3541,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 8,
    "id": "e5ff46a6-3bdf-4218-af62-2c34ca00a725",
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                "
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -3550,18 +3575,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 10,
    "id": "d4d8dfa7-3bad-42ce-a1c1-351c91fc3b06",
    "metadata": {},
    "outputs": [],
    "source": [
     "from pyspark.ml.regression import DecisionTreeRegressor       # unfit model\n",
-    "from pyspark.ml.regression import DecisionTreeRegressorModel  # fit model"
+    "from pyspark.ml.regression import DecisionTreeRegressionModel  # fit model"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 11,
    "id": "567b440b-5fcd-426b-80a7-2929105ff145",
    "metadata": {},
    "outputs": [],
@@ -3571,10 +3596,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 12,
    "id": "20f6a9a8-ae64-4275-8839-d06aed76ead0",
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[Stage 9:>                                                          (0 + 1) / 1]"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -3590,6 +3622,13 @@
       "+---+---+------------------+---------+\n",
       "\n"
      ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                "
+     ]
     }
    ],
    "source": [
@@ -3599,7 +3638,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 13,
    "id": "c1f3af58-0781-4686-82c8-c537257595d2",
    "metadata": {},
    "outputs": [
@@ -3618,7 +3657,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 14,
    "id": "2cb9d1e9-27dc-41f7-8d27-9f73d87cd805",
    "metadata": {},
    "outputs": [
@@ -3629,7 +3668,7 @@
        " pyspark.ml.regression.DecisionTreeRegressionModel)"
       ]
      },
-     "execution_count": 38,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3640,7 +3679,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 15,
    "id": "df176bed-374f-4663-b0d3-68fe549afd35",
    "metadata": {},
    "outputs": [
@@ -3668,7 +3707,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": 16,
    "id": "275627c5-ccd1-4b8d-9c7d-7961f2bda773",
    "metadata": {},
    "outputs": [],
@@ -3679,7 +3718,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
+   "execution_count": 17,
    "id": "2ac72a6d-9928-48a0-88b2-90b61f13bafb",
    "metadata": {},
    "outputs": [],
@@ -3690,7 +3729,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 18,
    "id": "1c98e894-a0ea-4919-b664-93fe2079ef4c",
    "metadata": {},
    "outputs": [
@@ -3717,7 +3756,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 52,
+   "execution_count": 19,
    "id": "eae7eb14-fe65-43d8-9896-980aa63dd7e2",
    "metadata": {},
    "outputs": [
@@ -3725,7 +3764,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "DecisionTreeRegressionModel: uid=DecisionTreeRegressor_a71faf54b217, depth=5, numNodes=51, numFeatures=2\n",
+      "DecisionTreeRegressionModel: uid=DecisionTreeRegressor_f9d08826ac8b, depth=5, numNodes=51, numFeatures=2\n",
       "  If (feature 0 <= 4.5)\n",
       "   If (feature 0 <= 1.5)\n",
       "    If (feature 1 <= 0.5)\n",
@@ -3812,7 +3851,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 55,
+   "execution_count": 20,
    "id": "72930355-914f-4890-b037-b3d9bc63d9a2",
    "metadata": {},
    "outputs": [
@@ -3830,7 +3869,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": 21,
    "id": "52de063d-3d97-4510-9fc0-3de20bb87e71",
    "metadata": {},
    "outputs": [
@@ -3839,8 +3878,8 @@
      "output_type": "stream",
      "text": [
       "Found 2 items\n",
-      "drwxr-xr-x   - root supergroup          0 2025-03-17 14:38 hdfs://nn:9000/model/stages/0_VectorAssembler_f70504ebcb7d\n",
-      "drwxr-xr-x   - root supergroup          0 2025-03-17 14:38 hdfs://nn:9000/model/stages/1_DecisionTreeRegressor_a71faf54b217\n"
+      "drwxr-xr-x   - root supergroup          0 2025-03-19 15:57 hdfs://nn:9000/model/stages/0_VectorAssembler_60e50be1add5\n",
+      "drwxr-xr-x   - root supergroup          0 2025-03-19 15:57 hdfs://nn:9000/model/stages/1_DecisionTreeRegressor_f9d08826ac8b\n"
      ]
     }
    ],
@@ -3850,7 +3889,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": 22,
    "id": "ada9e676-df19-4b32-a153-5885d6ca7b98",
    "metadata": {},
    "outputs": [],
@@ -3860,7 +3899,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 61,
+   "execution_count": 25,
    "id": "eb0039d5-0c8b-4bf6-b3b2-1c055f375448",
    "metadata": {},
    "outputs": [
@@ -3868,38 +3907,70 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "+---+---+------------------+---------+-------------------+\n",
-      "| x1| x2|                 y| features|         prediction|\n",
-      "+---+---+------------------+---------+-------------------+\n",
-      "|0.0|1.0| 1.825532164689176|[0.0,1.0]| 1.4040414970076673|\n",
-      "|0.0|2.0|2.3378982019998977|[0.0,2.0]| 2.5233870027882084|\n",
-      "|1.0|0.0|1.6481475984177445|[1.0,0.0]| 1.6492029178176435|\n",
-      "|1.0|0.0|1.7915461186682566|[1.0,0.0]| 1.6492029178176435|\n",
-      "|1.0|1.0|2.6028920463022995|[1.0,1.0]| 2.7683675845769957|\n",
-      "|3.0|2.0| 5.350290084086203|[3.0,2.0]| 6.2509689582932975|\n",
-      "|6.0|1.0| 7.520628154018157|[6.0,1.0]|  7.576965189743188|\n",
-      "|8.0|2.0|10.690777404180885|[8.0,2.0]| 10.409642251373787|\n",
-      "|9.0|1.0|10.406420567186862|[9.0,1.0]| 10.000188475606585|\n",
-      "|0.0|0.0|0.6282897829715238|(2,[],[])|0.45968856929425694|\n",
-      "|2.0|1.0| 3.961294224851377|[2.0,1.0]| 3.6707752377825416|\n",
-      "|4.0|0.0|  4.83443596090177|[4.0,0.0]|  4.317093345475164|\n",
-      "|5.0|0.0| 5.982564448517017|[5.0,0.0]|  5.527235382083779|\n",
-      "|5.0|1.0| 6.984174014959795|[5.0,1.0]|  6.315046161076428|\n",
-      "|6.0|1.0| 7.935274195582237|[6.0,1.0]|  7.576965189743188|\n",
-      "|8.0|1.0| 9.990683484052727|[8.0,1.0]| 10.000188475606585|\n",
-      "|2.0|1.0|3.8856373417168104|[2.0,1.0]| 3.6707752377825416|\n",
-      "|2.0|2.0| 4.823994914064544|[2.0,2.0]| 4.6983808565300755|\n",
-      "|6.0|2.0| 8.555804264666113|[6.0,2.0]|  8.233809052748645|\n",
-      "|8.0|0.0|  8.59804463859586|[8.0,0.0]|  8.525778987202642|\n",
-      "+---+---+------------------+---------+-------------------+\n",
-      "only showing top 20 rows\n",
+      "+---+---+------------------+---------+------------------+\n",
+      "| x1| x2|                 y| features|        prediction|\n",
+      "+---+---+------------------+---------+------------------+\n",
+      "|0.0|1.0| 1.825532164689176|[0.0,1.0]|1.4040414970076673|\n",
+      "|0.0|2.0|2.3378982019998977|[0.0,2.0]|2.5233870027882084|\n",
+      "|1.0|0.0|1.6481475984177445|[1.0,0.0]|1.6492029178176435|\n",
+      "+---+---+------------------+---------+------------------+\n",
+      "only showing top 3 rows\n",
       "\n"
      ]
     }
    ],
    "source": [
-    "model.transform(test).show()"
+    "model.transform(test).show(3)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "eca96811-5057-4724-bab0-76424985a950",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyspark.ml.evaluation import RegressionEvaluator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "295527ed-c180-4124-84d6-8832cc26b9f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "r2score = RegressionEvaluator(predictionCol=\"prediction\", labelCol=\"y\", metricName=\"r2\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "81923380-c618-44c4-8bb9-00949c6bb27c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.9838453310282415"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "r2score.evaluate(model.transform(test))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4005bcd1-7429-4454-bdae-eee826f07de4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/lec/22-spark/nb/lec2.ipynb b/lec/22-spark/nb/lec2.ipynb
index 09af800..5bdcf21 100644
--- a/lec/22-spark/nb/lec2.ipynb
+++ b/lec/22-spark/nb/lec2.ipynb
@@ -3753,7 +3753,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 65,
+   "execution_count": 67,
    "id": "f0a89e75-9aae-4784-9dab-67fec0e8e152",
    "metadata": {},
    "outputs": [
@@ -3845,6 +3845,83 @@
    "source": [
     "print(model.stages[1].toDebugString)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "id": "4270c993-4813-4b61-a23d-6b4eb0531a6d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+---+---+------------------+---------+------------------+\n",
+      "| x1| x2|                 y| features|        prediction|\n",
+      "+---+---+------------------+---------+------------------+\n",
+      "|0.0|1.0| 1.825532164689176|[0.0,1.0]|1.4040414970076673|\n",
+      "|0.0|2.0|2.3378982019998977|[0.0,2.0]|2.5233870027882084|\n",
+      "|1.0|0.0|1.6481475984177445|[1.0,0.0]|1.6492029178176435|\n",
+      "+---+---+------------------+---------+------------------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "model.transform(test).limit(3).show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "id": "033be49b-6a44-4112-846a-bdff23278659",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyspark.ml.evaluation import RegressionEvaluator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "id": "993fd18e-1092-4be5-b7ec-8cae34f09c96",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "r2score = RegressionEvaluator(\n",
+    "    predictionCol=\"prediction\",\n",
+    "    labelCol=\"y\",\n",
+    "    metricName=\"r2\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "id": "fb286b37-201d-4eb5-8e84-d406824c8369",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                "
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0.9838453310282415"
+      ]
+     },
+     "execution_count": 72,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "r2score.evaluate(model.transform(test))"
+   ]
   }
  ],
  "metadata": {
-- 
GitLab