From 67240c8784f2e2cf5e7122d2b8768bb59416db8b Mon Sep 17 00:00:00 2001
From: TYLER CARAZA-HARTER <tharter@cs544-tharter.cs.wisc.edu>
Date: Wed, 12 Mar 2025 10:03:44 -0500
Subject: [PATCH] lec 22 setup

---
 lec/22-spark/Dockerfile         | 8 ++++----
 lec/22-spark/docker-compose.yml | 4 ++--
 lec/22-spark/nb/starter.ipynb   | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lec/22-spark/Dockerfile b/lec/22-spark/Dockerfile
index 23ff9b3..7a58197 100644
--- a/lec/22-spark/Dockerfile
+++ b/lec/22-spark/Dockerfile
@@ -1,14 +1,14 @@
 FROM ubuntu:24.04
-RUN apt-get update; apt-get install -y wget curl openjdk-11-jdk python3-pip net-tools unzip
+RUN apt-get update && apt-get install -y wget curl openjdk-11-jdk python3-pip nano
 
 # SPARK
-RUN wget https://dlcdn.apache.org/spark/spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz && tar -xf spark-3.5.3-bin-hadoop3.tgz && rm spark-3.5.3-bin-hadoop3.tgz
-
+RUN wget https://archive.apache.org/dist/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz && tar -xf spark-3.5.5-bin-hadoop3.tgz && rm spark-3.5.5-bin-hadoop3.tgz
+
 # HDFS
 RUN wget https://dlcdn.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz && tar -xf hadoop-3.3.6.tar.gz && rm hadoop-3.3.6.tar.gz
 
 # Jupyter
-RUN pip3 install jupyterlab==4.0.3 pandas==2.2.3 pyspark==3.5.3 matplotlib --break-system-packages
+RUN pip3 install --no-cache-dir jupyterlab==4.3.5 pandas==2.2.3 pyspark==3.5.5 matplotlib==3.10.1 --break-system-packages
 
 ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
 ENV PATH="${PATH}:/hadoop-3.3.6/bin"
diff --git a/lec/22-spark/docker-compose.yml b/lec/22-spark/docker-compose.yml
index 9189a8f..6e7a37e 100644
--- a/lec/22-spark/docker-compose.yml
+++ b/lec/22-spark/docker-compose.yml
@@ -20,10 +20,10 @@ services:
     spark-boss:
         image: spark-demo
         hostname: boss
-        command: sh -c "/spark-3.5.3-bin-hadoop3/sbin/start-master.sh && sleep infinity"
+        command: sh -c "/spark-3.5.5-bin-hadoop3/sbin/start-master.sh && sleep infinity"
 
     spark-worker:
         image: spark-demo
-        command: sh -c "/spark-3.5.3-bin-hadoop3/sbin/start-worker.sh spark://boss:7077 -c 1 -m 512M && sleep infinity"
+        command: sh -c "/spark-3.5.5-bin-hadoop3/sbin/start-worker.sh spark://boss:7077 -c 2 -m 2g && sleep infinity"
         deploy:
                 replicas: 2
diff --git a/lec/22-spark/nb/starter.ipynb b/lec/22-spark/nb/starter.ipynb
index 8930462..6a085d2 100644
--- a/lec/22-spark/nb/starter.ipynb
+++ b/lec/22-spark/nb/starter.ipynb
@@ -20,7 +20,7 @@
     "from pyspark.sql import SparkSession\n",
     "spark = (SparkSession.builder.appName(\"cs544\")\n",
     "         .master(\"spark://boss:7077\")\n",
-    "         .config(\"spark.executor.memory\", \"512M\")\n",
+    "         .config(\"spark.executor.memory\", \"2G\")\n",
     "         .config(\"spark.sql.warehouse.dir\", \"hdfs://nn:9000/user/hive/warehouse\")\n",
     "         .enableHiveSupport()\n",
     "         .getOrCreate())"
@@ -115,7 +115,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,
-- 
GitLab