Skip to content
Snippets Groups Projects
Commit 67240c87 authored by TYLER CARAZA-HARTER's avatar TYLER CARAZA-HARTER
Browse files

lec 22 setup

parent cab26261
No related branches found
No related tags found
No related merge requests found
FROM ubuntu:24.04

# Base packages: JDK 11 for Spark/Hadoop, pip for JupyterLab/PySpark.
# `&&` (not `;`) so a failed `apt-get update` aborts the build instead of
# silently installing from a stale index; clean the apt lists in the same
# layer so they never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
      curl \
      nano \
      openjdk-11-jdk \
      python3-pip \
      wget \
    && rm -rf /var/lib/apt/lists/*

# SPARK — archive.apache.org keeps old releases; dlcdn drops them once superseded.
RUN wget https://archive.apache.org/dist/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz \
    && tar -xf spark-3.5.5-bin-hadoop3.tgz \
    && rm spark-3.5.5-bin-hadoop3.tgz

# HDFS
RUN wget https://dlcdn.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz \
    && tar -xf hadoop-3.3.6.tar.gz \
    && rm hadoop-3.3.6.tar.gz

# Jupyter — versions pinned; pyspark must match the Spark install (3.5.5).
# --break-system-packages is required to pip-install into Ubuntu 24.04's
# system Python (PEP 668); --no-cache-dir keeps the pip cache out of the layer.
RUN pip3 install --no-cache-dir --break-system-packages \
      jupyterlab==4.3.5 \
      matplotlib==3.10.1 \
      pandas==2.2.3 \
      pyspark==3.5.5

ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
ENV PATH="${PATH}:/hadoop-3.3.6/bin"
...
...@@ -20,10 +20,10 @@ services:
spark-boss: spark-boss:
image: spark-demo image: spark-demo
hostname: boss hostname: boss
command: sh -c "/spark-3.5.3-bin-hadoop3/sbin/start-master.sh && sleep infinity" command: sh -c "/spark-3.5.5-bin-hadoop3/sbin/start-master.sh && sleep infinity"
spark-worker: spark-worker:
image: spark-demo image: spark-demo
command: sh -c "/spark-3.5.3-bin-hadoop3/sbin/start-worker.sh spark://boss:7077 -c 1 -m 512M && sleep infinity" command: sh -c "/spark-3.5.5-bin-hadoop3/sbin/start-worker.sh spark://boss:7077 -c 2 -m 2g && sleep infinity"
deploy: deploy:
replicas: 2 replicas: 2
%% Cell type:code id:c8dca847-54af-4284-97d8-0682e88a6e8d tags:
``` python
from pyspark.sql import SparkSession

# Connect to the standalone cluster started by docker-compose.
# Executor memory (2G) must fit within the worker's -m 2g allocation;
# the Hive warehouse lives in HDFS so tables survive container restarts.
spark = (SparkSession.builder.appName("cs544")
         .master("spark://boss:7077")
         .config("spark.executor.memory", "2G")
         .config("spark.sql.warehouse.dir", "hdfs://nn:9000/user/hive/warehouse")
         .enableHiveSupport()
         .getOrCreate())
```
%% Output
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/27 01:41:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
%% Cell type:code id:2294e4e0-ab19-496c-980f-31df757e7837 tags:
``` python
!hdfs dfs -cp sf.csv hdfs://nn:9000/sf.csv !hdfs dfs -cp sf.csv hdfs://nn:9000/sf.csv
```
%% Cell type:code id:cb54bacc-b52a-4c25-93d2-2ba0f61de9b0 tags:
``` python
# Load the CSV from HDFS. inferSchema=True costs an extra full pass over
# the file but yields typed columns instead of all-strings.
df = (spark.read.format("csv")
      .option("header", True)
      .option("inferSchema", True)
      .load("hdfs://nn:9000/sf.csv"))
```
%% Output
%% Cell type:code id:c1298818-83f6-444b-b8a0-4be5b16fd6fb tags:
``` python
from pyspark.sql.functions import col, expr

# Parquet column names may not contain spaces, so rename every column
# (spaces -> underscores) before writing the dataset out to HDFS.
cols = [col(c).alias(c.replace(" ", "_")) for c in df.columns]
df.select(cols).write.format("parquet").save("hdfs://nn:9000/sf.parquet")
```
%% Output
23/10/27 01:43:57 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
%% Cell type:code id:37d1ded3-ed8a-4e39-94cb-dd3a3272af91 tags:
``` python
!hdfs dfs -rm hdfs://nn:9000/sf.csv !hdfs dfs -rm hdfs://nn:9000/sf.csv
```
%% Cell type:code id:abea48b5-e012-4ae2-a53a-e40350f94e20 tags:
``` python
# Reload df from the Parquet copy (column-pruned, typed — no inferSchema pass).
df = spark.read.format("parquet").load("hdfs://nn:9000/sf.parquet")
```
...
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment