Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • cdis/cs/courses/cs544/s25/main
  • zzhang2478/main
  • spark667/main
  • vijayprabhak/main
  • vijayprabhak/544-main
  • wyang338/cs-544-s-25
  • jmin39/main
7 results
Show changes
Commits on Source (43)
Showing
with 11684 additions and 63 deletions
File added
File added
File added
File added
File added
File added
File added
File added
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
# Image bundling JupyterLab, Spark 3.5.5 and Hadoop 3.3.6 (HDFS) on Ubuntu 24.04.
FROM ubuntu:24.04

# Chain with && (not ;) so a failed `apt-get update` aborts the build instead
# of silently installing from a stale or missing package index. The package
# lists are removed afterwards to keep the layer small.
RUN apt-get update \
    && apt-get install -y wget curl openjdk-11-jdk python3-pip nano \
    && rm -rf /var/lib/apt/lists/*

# SPARK
RUN wget https://archive.apache.org/dist/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz \
    && tar -xf spark-3.5.5-bin-hadoop3.tgz \
    && rm spark-3.5.5-bin-hadoop3.tgz

# HDFS
# dlcdn.apache.org only carries current releases, so fall back to the permanent
# archive if the fast mirror no longer has this version. -O pins the output
# name so a failed first attempt cannot leave a stray partial file behind.
RUN (wget -O hadoop-3.3.6.tar.gz https://dlcdn.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz \
        || wget -O hadoop-3.3.6.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz) \
    && tar -xf hadoop-3.3.6.tar.gz \
    && rm hadoop-3.3.6.tar.gz

# Jupyter
# --break-system-packages: install into the system Python rather than a venv.
RUN pip3 install jupyterlab==4.3.5 pandas==2.2.3 pyspark==3.5.5 --break-system-packages

# NOTE(review): this JDK path is amd64-specific; confirm the directory name if
# the image is ever built for another architecture.
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
ENV HADOOP_HOME=/hadoop-3.3.6
# Put the `hdfs` CLI on PATH.
ENV PATH="${PATH}:${HADOOP_HOME}/bin"
# Demo cluster: one notebook server, one HDFS NameNode + DataNode, and a Spark
# standalone master with two workers. Every service runs the same image.
services:
  # JupyterLab. Both ports are published on 127.0.0.1 only; the token is empty.
  # 5000 is the JupyterLab port (see --port); 4040 is presumably the Spark
  # driver UI - confirm.
  nb:
    image: spark-demo
    ports:
      - "127.0.0.1:5000:5000"
      - "127.0.0.1:4040:4040"
    volumes:
      - "./nb:/nb"
    command: python3 -m jupyterlab --no-browser --ip=0.0.0.0 --port=5000 --allow-root --NotebookApp.token=''

  # HDFS NameNode. The filesystem is re-formatted on every container start.
  nn:
    image: spark-demo
    hostname: nn
    command: sh -c "hdfs namenode -format -force && hdfs namenode -D dfs.replication=1 -fs hdfs://nn:9000"

  # HDFS DataNode, pointed at the NameNode via its hostname.
  dn:
    image: spark-demo
    command: hdfs datanode -fs hdfs://nn:9000

  # Spark standalone master. `sleep infinity` keeps the container running after
  # the start script returns.
  spark-boss:
    image: spark-demo
    hostname: boss
    command: sh -c "/spark-3.5.5-bin-hadoop3/sbin/start-master.sh && sleep infinity"

  # Spark workers (-c 2 -m 2g each), registered with the master at boss:7077.
  spark-worker:
    image: spark-demo
    command: sh -c "/spark-3.5.5-bin-hadoop3/sbin/start-worker.sh spark://boss:7077 -c 2 -m 2g && sleep infinity"
    deploy:
      replicas: 2
This diff is collapsed.
# Image bundling JupyterLab, Spark 3.5.5 and Hadoop 3.3.6 (HDFS) on Ubuntu 24.04.
FROM ubuntu:24.04

# Chain with && (not ;) so a failed `apt-get update` aborts the build instead
# of silently installing from a stale or missing package index. The package
# lists are removed afterwards to keep the layer small.
RUN apt-get update \
    && apt-get install -y wget curl openjdk-11-jdk python3-pip nano \
    && rm -rf /var/lib/apt/lists/*

# SPARK
# Prefer the fast dlcdn mirror, but it only carries current releases: fall back
# to the permanent archive once 3.5.5 has been superseded there. -O pins the
# output name so a failed first attempt cannot leave a stray partial file.
RUN (wget -O spark-3.5.5-bin-hadoop3.tgz https://dlcdn.apache.org/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz \
        || wget -O spark-3.5.5-bin-hadoop3.tgz https://archive.apache.org/dist/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz) \
    && tar -xf spark-3.5.5-bin-hadoop3.tgz \
    && rm spark-3.5.5-bin-hadoop3.tgz

# HDFS
# Same mirror-then-archive fallback as for Spark.
RUN (wget -O hadoop-3.3.6.tar.gz https://dlcdn.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz \
        || wget -O hadoop-3.3.6.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz) \
    && tar -xf hadoop-3.3.6.tar.gz \
    && rm hadoop-3.3.6.tar.gz

# Jupyter
# --break-system-packages: install into the system Python rather than a venv.
RUN pip3 install jupyterlab==4.3.5 pandas==2.2.3 pyspark==3.5.5 matplotlib==3.10.1 --break-system-packages

# NOTE(review): this JDK path is amd64-specific; confirm the directory name if
# the image is ever built for another architecture.
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
ENV HADOOP_HOME=/hadoop-3.3.6
# Put the `hdfs` CLI on PATH.
ENV PATH="${PATH}:${HADOOP_HOME}/bin"
# Demo cluster: one notebook server, one HDFS NameNode + DataNode, and a Spark
# standalone master with two workers. Every service runs the same image.
services:
  # JupyterLab. Both ports are published on 127.0.0.1 only; the token is empty.
  # 5000 is the JupyterLab port (see --port); 4040 is presumably the Spark
  # driver UI - confirm.
  nb:
    image: spark-demo
    ports:
      - "127.0.0.1:5000:5000"
      - "127.0.0.1:4040:4040"
    volumes:
      - "./nb:/nb"
    command: python3 -m jupyterlab --no-browser --ip=0.0.0.0 --port=5000 --allow-root --NotebookApp.token=''

  # HDFS NameNode. The filesystem is re-formatted on every container start.
  nn:
    image: spark-demo
    hostname: nn
    command: sh -c "hdfs namenode -format -force && hdfs namenode -D dfs.replication=1 -fs hdfs://nn:9000"

  # HDFS DataNode, pointed at the NameNode via its hostname.
  dn:
    image: spark-demo
    command: hdfs datanode -fs hdfs://nn:9000

  # Spark standalone master. `sleep infinity` keeps the container running after
  # the start script returns.
  spark-boss:
    image: spark-demo
    hostname: boss
    command: sh -c "/spark-3.5.5-bin-hadoop3/sbin/start-master.sh && sleep infinity"

  # Spark workers (-c 2 -m 2g each), registered with the master at boss:7077.
  spark-worker:
    image: spark-demo
    command: sh -c "/spark-3.5.5-bin-hadoop3/sbin/start-worker.sh spark://boss:7077 -c 2 -m 2g && sleep infinity"
    deploy:
      replicas: 2
date,holiday
01/01/2013,New Year's Day
01/01/2014,New Year's Day
01/01/2015,New Year's Day
01/01/2016,New Year's Day
01/01/2018,New Year's Day
01/01/2019,New Year's Day
01/01/2020,New Year's Day
01/01/2021,New Year's Day
01/02/2012,New Year's Day
01/02/2017,New Year's Day
01/15/2018,"Birthday of Martin Luther King, Jr."
01/16/2012,"Birthday of Martin Luther King, Jr."
01/16/2017,"Birthday of Martin Luther King, Jr."
01/17/2011,"Birthday of Martin Luther King, Jr."
01/17/2022,"Birthday of Martin Luther King, Jr."
01/18/2016,"Birthday of Martin Luther King, Jr."
01/18/2021,"Birthday of Martin Luther King, Jr."
01/19/2015,"Birthday of Martin Luther King, Jr."
01/20/2014,"Birthday of Martin Luther King, Jr."
01/20/2020,"Birthday of Martin Luther King, Jr."
01/20/2021,Inauguration Day
01/21/2013,"Birthday of Martin Luther King, Jr."
01/21/2019,"Birthday of Martin Luther King, Jr."
02/15/2016,Washington's Birthday
02/15/2021,Washington's Birthday
02/16/2015,Washington's Birthday
02/17/2014,Washington's Birthday
02/17/2020,Washington's Birthday
02/18/2013,Washington's Birthday
02/18/2019,Washington's Birthday
02/19/2018,Washington's Birthday
02/20/2012,Washington's Birthday
02/20/2017,Washington's Birthday
02/21/2011,Washington's Birthday
02/21/2022,Washington's Birthday
05/25/2015,Memorial Day
05/25/2020,Memorial Day
05/26/2014,Memorial Day
05/27/2013,Memorial Day
05/27/2019,Memorial Day
05/28/2012,Memorial Day
05/28/2018,Memorial Day
05/29/2017,Memorial Day
05/30/2011,Memorial Day
05/30/2016,Memorial Day
05/30/2022,Memorial Day
05/31/2021,Memorial Day
06/18/2021,Juneteenth National Independence Day
06/20/2022,Juneteenth National Independence Day
07/03/2015,Independence Day
07/03/2020,Independence Day
07/04/2011,Independence Day
07/04/2012,Independence Day
07/04/2013,Independence Day
07/04/2014,Independence Day
07/04/2016,Independence Day
07/04/2017,Independence Day
07/04/2018,Independence Day
07/04/2019,Independence Day
07/04/2022,Independence Day
07/05/2021,Independence Day
09/01/2014,Labor Day
09/02/2013,Labor Day
09/02/2019,Labor Day
09/03/2012,Labor Day
09/03/2018,Labor Day
09/04/2017,Labor Day
09/05/2011,Labor Day
09/05/2016,Labor Day
09/05/2022,Labor Day
09/06/2021,Labor Day
09/07/2015,Labor Day
09/07/2020,Labor Day
10/08/2012,Columbus Day
10/08/2018,Columbus Day
10/09/2017,Columbus Day
10/10/2011,Columbus Day
10/10/2016,Columbus Day
10/10/2022,Columbus Day
10/11/2021,Columbus Day
10/12/2015,Columbus Day
10/12/2020,Columbus Day
10/13/2014,Columbus Day
10/14/2013,Columbus Day
10/14/2019,Columbus Day
11/10/2017,Veterans Day
11/11/2011,Veterans Day
11/11/2013,Veterans Day
11/11/2014,Veterans Day
11/11/2015,Veterans Day
11/11/2016,Veterans Day
11/11/2019,Veterans Day
11/11/2020,Veterans Day
11/11/2021,Veterans Day
11/11/2022,Veterans Day
11/12/2012,Veterans Day
11/12/2018,Veterans Day
11/22/2012,Thanksgiving Day
11/22/2018,Thanksgiving Day
11/23/2017,Thanksgiving Day
11/24/2011,Thanksgiving Day
11/24/2016,Thanksgiving Day
11/24/2022,Thanksgiving Day
11/25/2021,Thanksgiving Day
11/26/2015,Thanksgiving Day
11/26/2020,Thanksgiving Day
11/27/2014,Thanksgiving Day
11/28/2013,Thanksgiving Day
11/28/2019,Thanksgiving Day
12/24/2021,Christmas Day
12/25/2012,Christmas Day
12/25/2013,Christmas Day
12/25/2014,Christmas Day
12/25/2015,Christmas Day
12/25/2017,Christmas Day
12/25/2018,Christmas Day
12/25/2019,Christmas Day
12/25/2020,Christmas Day
12/26/2011,Christmas Day
12/26/2016,Christmas Day
12/26/2022,Christmas Day
12/31/2021,New Year's Day
This diff is collapsed.
This diff is collapsed.
%% Cell type:code id:c8dca847-54af-4284-97d8-0682e88a6e8d tags:
``` python
from pyspark.sql import SparkSession

# Connect to the standalone master at boss:7077 with 2G per executor, keep the
# SQL warehouse directory on HDFS, and turn on Hive support.
spark = (
    SparkSession.builder
    .appName("cs544")
    .master("spark://boss:7077")
    .config("spark.executor.memory", "2G")
    .config("spark.sql.warehouse.dir", "hdfs://nn:9000/user/hive/warehouse")
    .enableHiveSupport()
    .getOrCreate()
)
```
%% Output
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/27 01:41:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
%% Cell type:code id:2294e4e0-ab19-496c-980f-31df757e7837 tags:
``` python
# Shell escape: copy the local sf.csv into HDFS at hdfs://nn:9000/sf.csv.
!hdfs dfs -cp sf.csv hdfs://nn:9000/sf.csv
```
%% Cell type:code id:cb54bacc-b52a-4c25-93d2-2ba0f61de9b0 tags:
``` python
# Read the CSV from HDFS: the first row supplies the column names, and column
# types are inferred from the data.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("hdfs://nn:9000/sf.csv")
)
```
%% Output
%% Cell type:code id:c1298818-83f6-444b-b8a0-4be5b16fd6fb tags:
``` python
from pyspark.sql.functions import col, expr

# Alias every column so spaces in its name become underscores (presumably
# because the Parquet writer rejects column names containing spaces - confirm),
# then write the renamed columns to HDFS as Parquet.
cols = [col(name).alias(name.replace(" ", "_")) for name in df.columns]
(
    df.select(cols)
    .write
    .format("parquet")
    .save("hdfs://nn:9000/sf.parquet")
)
```
%% Output
23/10/27 01:43:57 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
%% Cell type:code id:37d1ded3-ed8a-4e39-94cb-dd3a3272af91 tags:
``` python
# Shell escape: delete the CSV from HDFS; the Parquet copy written earlier is
# what the notebook loads from here on.
!hdfs dfs -rm hdfs://nn:9000/sf.csv
```
%% Cell type:code id:abea48b5-e012-4ae2-a53a-e40350f94e20 tags:
``` python
# Rebind df to the Parquet copy stored on HDFS.
df = (
    spark.read
    .format("parquet")
    .load("hdfs://nn:9000/sf.parquet")
)
```