Skip to content
Snippets Groups Projects
Commit 67240c87 authored by TYLER CARAZA-HARTER
Browse files

lec 22 setup

parent cab26261
No related branches found
No related tags found
No related merge requests found
# Image for the Spark/HDFS/Jupyter demo: Ubuntu 24.04 with JDK 11, Spark, Hadoop and JupyterLab.
# NOTE(review): this listing looks like a rendered commit diff whose +/- markers were
# lost; each adjacent pair of near-identical instructions is presumably the before/after
# of one change. Built literally, every instruction below would run. Confirm against the commit.
FROM ubuntu:24.04
# Base tooling plus JDK 11 and pip; this variant also installs net-tools and unzip.
RUN apt-get update; apt-get install -y wget curl openjdk-11-jdk python3-pip net-tools unzip
# Same base tooling, with nano instead of net-tools and unzip.
RUN apt-get update; apt-get install -y wget curl openjdk-11-jdk python3-pip nano
# SPARK
# Download, unpack and delete the Spark 3.5.3 tarball, fetched from dlcdn.apache.org.
RUN wget https://dlcdn.apache.org/spark/spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz && tar -xf spark-3.5.3-bin-hadoop3.tgz && rm spark-3.5.3-bin-hadoop3.tgz
# Same steps for Spark 3.5.5, fetched from archive.apache.org instead.
RUN wget https://archive.apache.org/dist/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz && tar -xf spark-3.5.5-bin-hadoop3.tgz && rm spark-3.5.5-bin-hadoop3.tgz
# HDFS
# Download, unpack and delete the Hadoop 3.3.6 tarball.
RUN wget https://dlcdn.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz && tar -xf hadoop-3.3.6.tar.gz && rm hadoop-3.3.6.tar.gz
# Jupyter
# Python packages with pyspark pinned to 3.5.3 (same version as the Spark 3.5.3 tarball); matplotlib is unpinned.
RUN pip3 install jupyterlab==4.0.3 pandas==2.2.3 pyspark==3.5.3 matplotlib --break-system-packages
# Fully pinned variant with pyspark 3.5.5 (same version as the Spark 3.5.5 tarball).
RUN pip3 install jupyterlab==4.3.5 pandas==2.2.3 pyspark==3.5.5 matplotlib==3.10.1 --break-system-packages
# Point JAVA_HOME at the OpenJDK 11 directory (amd64 path layout).
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
# Append the Hadoop 3.3.6 bin directory to PATH.
ENV PATH="${PATH}:/hadoop-3.3.6/bin"
......
......@@ -20,10 +20,10 @@ services:
# Compose services for the Spark master ("boss") and its workers.
# NOTE(review): indentation was lost in this listing, and each pair of adjacent
# `command` lines is presumably the before/after of a diff. Only one of each pair
# should exist in the real compose file (duplicate keys are invalid YAML). Confirm.
spark-boss:
image: spark-demo
# The worker command below addresses the master as spark://boss:7077.
hostname: boss
# Run start-master.sh from the Spark 3.5.3 directory; `sleep infinity` presumably
# keeps the container alive after the script returns.
command: sh -c "/spark-3.5.3-bin-hadoop3/sbin/start-master.sh && sleep infinity"
# Same, from the Spark 3.5.5 directory.
command: sh -c "/spark-3.5.5-bin-hadoop3/sbin/start-master.sh && sleep infinity"
spark-worker:
image: spark-demo
# Start a worker against spark://boss:7077 from the Spark 3.5.3 directory, passing -c 1 -m 512M.
command: sh -c "/spark-3.5.3-bin-hadoop3/sbin/start-worker.sh spark://boss:7077 -c 1 -m 512M && sleep infinity"
# Same, from the Spark 3.5.5 directory, passing -c 2 -m 2g.
command: sh -c "/spark-3.5.5-bin-hadoop3/sbin/start-worker.sh spark://boss:7077 -c 2 -m 2g && sleep infinity"
deploy:
# Run two replicas of the worker service.
replicas: 2
%% Cell type:code id:c8dca847-54af-4284-97d8-0682e88a6e8d tags:
``` python
from pyspark.sql import SparkSession
# Build (or reuse) a Hive-enabled session named "cs544" against the master at spark://boss:7077.
spark = (SparkSession.builder.appName("cs544")
.master("spark://boss:7077")
# NOTE(review): the two spark.executor.memory lines are presumably the before/after
# of a diff; if both really run, the later "2G" setting presumably wins. Confirm.
.config("spark.executor.memory", "512M")
.config("spark.executor.memory", "2G")
# Use an HDFS directory on nn:9000 as the SQL warehouse location.
.config("spark.sql.warehouse.dir", "hdfs://nn:9000/user/hive/warehouse")
.enableHiveSupport()
.getOrCreate())
```
%% Output
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/27 01:41:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
%% Cell type:code id:2294e4e0-ab19-496c-980f-31df757e7837 tags:
``` python
# Shell escape: copy the local sf.csv to hdfs://nn:9000/sf.csv.
!hdfs dfs -cp sf.csv hdfs://nn:9000/sf.csv
```
%% Cell type:code id:cb54bacc-b52a-4c25-93d2-2ba0f61de9b0 tags:
``` python
# Load sf.csv from HDFS, taking column names from the header row and
# letting Spark infer the column types.
df = spark.read.load(
    "hdfs://nn:9000/sf.csv",
    format="csv",
    header=True,
    inferSchema=True,
)
```
%% Output
%% Cell type:code id:c1298818-83f6-444b-b8a0-4be5b16fd6fb tags:
``` python
from pyspark.sql.functions import col, expr
# Alias every column to its own name with spaces replaced by underscores.
cols = [
    col(name).alias(name.replace(" ", "_"))
    for name in df.columns
]
# Write the renamed columns to HDFS in Parquet format.
df.select(cols).write.save("hdfs://nn:9000/sf.parquet", format="parquet")
```
%% Output
23/10/27 01:43:57 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
%% Cell type:code id:37d1ded3-ed8a-4e39-94cb-dd3a3272af91 tags:
``` python
# Shell escape: delete sf.csv from HDFS; the next cell reads sf.parquet instead.
!hdfs dfs -rm hdfs://nn:9000/sf.csv
```
%% Cell type:code id:abea48b5-e012-4ae2-a53a-e40350f94e20 tags:
``` python
# Rebind df to the Parquet dataset stored at hdfs://nn:9000/sf.parquet.
df = spark.read.load("hdfs://nn:9000/sf.parquet", format="parquet")
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment