From e0ce211852c1b61281a844ec99aebc481bb5bdef Mon Sep 17 00:00:00 2001 From: TYLER CARAZA-HARTER <tharter@cs544-tharter.cs.wisc.edu> Date: Wed, 19 Mar 2025 14:22:04 -0500 Subject: [PATCH] wrapup ml demos --- lec/22-spark/nb/lec1.ipynb | 205 +++++++++++++++++++++++++------------ lec/22-spark/nb/lec2.ipynb | 79 +++++++++++++- 2 files changed, 216 insertions(+), 68 deletions(-) diff --git a/lec/22-spark/nb/lec1.ipynb b/lec/22-spark/nb/lec1.ipynb index 68a7739..6c42d14 100644 --- a/lec/22-spark/nb/lec1.ipynb +++ b/lec/22-spark/nb/lec1.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 11, + "execution_count": 2, "id": "c8dca847-54af-4284-97d8-0682e88a6e8d", "metadata": {}, "outputs": [ @@ -12,7 +12,7 @@ "text": [ "Setting default log level to \"WARN\".\n", "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", - "25/03/17 14:23:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + "25/03/19 15:56:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" ] } ], @@ -28,10 +28,21 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "b1664723-73de-4950-8c45-a33af0c07ee6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 3 items\n", + "drwxr-xr-x - root supergroup 0 2025-03-17 14:38 hdfs://nn:9000/model\n", + "drwxr-xr-x - root supergroup 0 2025-03-17 14:26 hdfs://nn:9000/test.parquet\n", + "drwxr-xr-x - root supergroup 0 2025-03-17 14:26 hdfs://nn:9000/train.parquet\n" + ] + } + ], "source": [ "!hdfs dfs -ls hdfs://nn:9000/" ] @@ -3434,7 +3445,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "id": "7dbe60b4-9a2b-4f9c-8e74-88eccaf2bcc6", "metadata": {}, "outputs": [ @@ -3444,7 +3455,7 @@ "DataFrame[x1: double, x2: double, y: double]" ] }, - "execution_count": 12, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -3461,7 +3472,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 4, "id": "4c628bc4-072d-4988-bb21-f4c36ce22883", "metadata": {}, "outputs": [], @@ -3472,17 +3483,24 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 5, "id": "3e387f9f-fb9c-4c03-9ba1-c8345ed13126", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " " + ] + }, { "data": { "text/plain": [ "(76, 24)" ] }, - "execution_count": 24, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -3493,18 +3511,10 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 6, "id": "4c65f3f3-b279-416f-b720-db6c29ab296e", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " " - ] - } - ], + "outputs": [], "source": [ "train.write.format(\"parquet\").mode(\"ignore\").save(\"hdfs://nn:9000/train.parquet\")\n", "test.write.format(\"parquet\").mode(\"ignore\").save(\"hdfs://nn:9000/test.parquet\")" @@ -3512,10 +3522,18 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 7, "id": "2dcc55e4-81d8-4ad3-b7cc-f6830c09888b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " " + ] + } + ], "source": [ "train = spark.read.format(\"parquet\").load(\"hdfs://nn:9000/train.parquet\")\n", "test = spark.read.format(\"parquet\").load(\"hdfs://nn:9000/test.parquet\")" @@ -3523,10 +3541,17 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 8, "id": "e5ff46a6-3bdf-4218-af62-2c34ca00a725", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " " + ] + }, { "name": "stdout", "output_type": "stream", @@ -3550,18 +3575,18 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 10, "id": "d4d8dfa7-3bad-42ce-a1c1-351c91fc3b06", "metadata": {}, "outputs": [], "source": [ "from pyspark.ml.regression import DecisionTreeRegressor # unfit model\n", - "from pyspark.ml.regression import DecisionTreeRegressorModel # fit model" + "from pyspark.ml.regression import DecisionTreeRegressionModel # fit model" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 11, "id": "567b440b-5fcd-426b-80a7-2929105ff145", "metadata": {}, "outputs": [], @@ -3571,10 +3596,17 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 12, "id": "20f6a9a8-ae64-4275-8839-d06aed76ead0", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 9:> (0 + 1) / 1]" + ] + }, { "name": "stdout", "output_type": "stream", @@ -3590,6 +3622,13 @@ "+---+---+------------------+---------+\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " " + ] } ], "source": [ @@ -3599,7 +3638,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 13, "id": "c1f3af58-0781-4686-82c8-c537257595d2", "metadata": {}, "outputs": [ @@ -3618,7 +3657,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 14, "id": "2cb9d1e9-27dc-41f7-8d27-9f73d87cd805", "metadata": {}, "outputs": [ @@ -3629,7 +3668,7 @@ " pyspark.ml.regression.DecisionTreeRegressionModel)" ] }, - "execution_count": 38, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -3640,7 +3679,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 15, "id": "df176bed-374f-4663-b0d3-68fe549afd35", "metadata": {}, "outputs": [ @@ -3668,7 +3707,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 16, "id": "275627c5-ccd1-4b8d-9c7d-7961f2bda773", "metadata": {}, "outputs": [], @@ -3679,7 +3718,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 17, "id": "2ac72a6d-9928-48a0-88b2-90b61f13bafb", "metadata": {}, "outputs": [], @@ -3690,7 +3729,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 18, "id": "1c98e894-a0ea-4919-b664-93fe2079ef4c", "metadata": {}, "outputs": [ @@ -3717,7 +3756,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 19, "id": "eae7eb14-fe65-43d8-9896-980aa63dd7e2", "metadata": {}, "outputs": [ @@ -3725,7 +3764,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "DecisionTreeRegressionModel: uid=DecisionTreeRegressor_a71faf54b217, depth=5, numNodes=51, numFeatures=2\n", + "DecisionTreeRegressionModel: uid=DecisionTreeRegressor_f9d08826ac8b, depth=5, numNodes=51, numFeatures=2\n", " If (feature 0 <= 4.5)\n", " If (feature 0 <= 1.5)\n", " If (feature 1 <= 0.5)\n", @@ -3812,7 +3851,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 20, "id": "72930355-914f-4890-b037-b3d9bc63d9a2", "metadata": {}, "outputs": [ @@ -3830,7 +3869,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 21, "id": "52de063d-3d97-4510-9fc0-3de20bb87e71", "metadata": {}, "outputs": [ @@ -3839,8 +3878,8 @@ "output_type": "stream", "text": [ "Found 2 items\n", - "drwxr-xr-x - root supergroup 0 2025-03-17 14:38 hdfs://nn:9000/model/stages/0_VectorAssembler_f70504ebcb7d\n", - "drwxr-xr-x - root supergroup 0 2025-03-17 14:38 hdfs://nn:9000/model/stages/1_DecisionTreeRegressor_a71faf54b217\n" + "drwxr-xr-x - root supergroup 0 2025-03-19 15:57 hdfs://nn:9000/model/stages/0_VectorAssembler_60e50be1add5\n", + "drwxr-xr-x - root supergroup 0 2025-03-19 15:57 hdfs://nn:9000/model/stages/1_DecisionTreeRegressor_f9d08826ac8b\n" ] } ], @@ -3850,7 +3889,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 22, "id": "ada9e676-df19-4b32-a153-5885d6ca7b98", "metadata": {}, "outputs": [], @@ -3860,7 +3899,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 25, "id": "eb0039d5-0c8b-4bf6-b3b2-1c055f375448", "metadata": {}, "outputs": [ @@ -3868,38 +3907,70 @@ "name": "stdout", "output_type": "stream", "text": [ - "+---+---+------------------+---------+-------------------+\n", - "| x1| x2| y| features| prediction|\n", - "+---+---+------------------+---------+-------------------+\n", - "|0.0|1.0| 1.825532164689176|[0.0,1.0]| 1.4040414970076673|\n", - "|0.0|2.0|2.3378982019998977|[0.0,2.0]| 2.5233870027882084|\n", - "|1.0|0.0|1.6481475984177445|[1.0,0.0]| 1.6492029178176435|\n", - "|1.0|0.0|1.7915461186682566|[1.0,0.0]| 1.6492029178176435|\n", - "|1.0|1.0|2.6028920463022995|[1.0,1.0]| 2.7683675845769957|\n", - "|3.0|2.0| 5.350290084086203|[3.0,2.0]| 6.2509689582932975|\n", - "|6.0|1.0| 7.520628154018157|[6.0,1.0]| 7.576965189743188|\n", - "|8.0|2.0|10.690777404180885|[8.0,2.0]| 10.409642251373787|\n", - "|9.0|1.0|10.406420567186862|[9.0,1.0]| 10.000188475606585|\n", - "|0.0|0.0|0.6282897829715238|(2,[],[])|0.45968856929425694|\n", - "|2.0|1.0| 3.961294224851377|[2.0,1.0]| 3.6707752377825416|\n", - "|4.0|0.0| 4.83443596090177|[4.0,0.0]| 4.317093345475164|\n", - "|5.0|0.0| 5.982564448517017|[5.0,0.0]| 5.527235382083779|\n", - "|5.0|1.0| 6.984174014959795|[5.0,1.0]| 6.315046161076428|\n", - "|6.0|1.0| 7.935274195582237|[6.0,1.0]| 7.576965189743188|\n", - "|8.0|1.0| 9.990683484052727|[8.0,1.0]| 10.000188475606585|\n", - "|2.0|1.0|3.8856373417168104|[2.0,1.0]| 3.6707752377825416|\n", - "|2.0|2.0| 4.823994914064544|[2.0,2.0]| 4.6983808565300755|\n", - "|6.0|2.0| 8.555804264666113|[6.0,2.0]| 8.233809052748645|\n", - "|8.0|0.0| 8.59804463859586|[8.0,0.0]| 8.525778987202642|\n", - "+---+---+------------------+---------+-------------------+\n", - "only showing top 20 rows\n", + "+---+---+------------------+---------+------------------+\n", + "| x1| x2| y| features| prediction|\n", + "+---+---+------------------+---------+------------------+\n", + "|0.0|1.0| 1.825532164689176|[0.0,1.0]|1.4040414970076673|\n", + "|0.0|2.0|2.3378982019998977|[0.0,2.0]|2.5233870027882084|\n", + "|1.0|0.0|1.6481475984177445|[1.0,0.0]|1.6492029178176435|\n", + "+---+---+------------------+---------+------------------+\n", + "only showing top 3 rows\n", "\n" ] } ], "source": [ - "model.transform(test).show()" + "model.transform(test).show(3)" ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "eca96811-5057-4724-bab0-76424985a950", + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.ml.evaluation import RegressionEvaluator" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "295527ed-c180-4124-84d6-8832cc26b9f4", + "metadata": {}, + "outputs": [], + "source": [ + "r2score = RegressionEvaluator(predictionCol=\"prediction\", labelCol=\"y\", metricName=\"r2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "81923380-c618-44c4-8bb9-00949c6bb27c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9838453310282415" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r2score.evaluate(model.transform(test))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4005bcd1-7429-4454-bdae-eee826f07de4", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/lec/22-spark/nb/lec2.ipynb b/lec/22-spark/nb/lec2.ipynb index 09af800..5bdcf21 100644 --- a/lec/22-spark/nb/lec2.ipynb +++ b/lec/22-spark/nb/lec2.ipynb @@ -3753,7 +3753,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 67, "id": "f0a89e75-9aae-4784-9dab-67fec0e8e152", "metadata": {}, "outputs": [ @@ -3845,6 +3845,83 @@ "source": [ "print(model.stages[1].toDebugString)" ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "4270c993-4813-4b61-a23d-6b4eb0531a6d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+---+------------------+---------+------------------+\n", + "| x1| x2| y| features| prediction|\n", + "+---+---+------------------+---------+------------------+\n", + "|0.0|1.0| 1.825532164689176|[0.0,1.0]|1.4040414970076673|\n", + "|0.0|2.0|2.3378982019998977|[0.0,2.0]|2.5233870027882084|\n", + "|1.0|0.0|1.6481475984177445|[1.0,0.0]|1.6492029178176435|\n", + "+---+---+------------------+---------+------------------+\n", + "\n" + ] + } + ], + "source": [ + "model.transform(test).limit(3).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "033be49b-6a44-4112-846a-bdff23278659", + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.ml.evaluation import RegressionEvaluator" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "993fd18e-1092-4be5-b7ec-8cae34f09c96", + "metadata": {}, + "outputs": [], + "source": [ + "r2score = RegressionEvaluator(\n", + " predictionCol=\"prediction\",\n", + " labelCol=\"y\",\n", + " metricName=\"r2\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "fb286b37-201d-4eb5-8e84-d406824c8369", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "data": { + "text/plain": [ + "0.9838453310282415" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r2score.evaluate(model.transform(test))" + ] } ], "metadata": { -- GitLab