Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • cdis/cs/courses/cs544/s25/main
  • zzhang2478/main
  • spark667/main
  • vijayprabhak/main
  • vijayprabhak/544-main
  • wyang338/cs-544-s-25
  • jmin39/main
7 results
Show changes
Showing with 124220 additions and 0 deletions
# HDFS DataNode image, layered on the shared p5-base image.
FROM p5-base

# Start the DataNode and point it at the NameNode's filesystem URI
# (host "nn", port 9000).
CMD ["hdfs", "datanode", "-fs", "hdfs://nn:9000"]
# Compose project for the HDFS + Spark + notebook stack.
# PROJECT must be set in the environment; it names the Compose project and
# is the prefix of every image tag below.
name: ${PROJECT}

services:
  # Notebook container. Both ports are published on the loopback interface
  # only, so they are not reachable from other machines.
  nb:
    image: ${PROJECT}-nb
    ports:
      - "127.0.0.1:5000:5000"
      # NOTE(review): 4040 is presumably the Spark application UI — confirm.
      - "127.0.0.1:4040:4040"
    volumes:
      - "./nb:/nb"
    deploy:
      resources:
        limits:
          memory: 1.5G

  # HDFS NameNode; the fixed hostname "nn" gives other containers a stable
  # name to connect to.
  nn:
    image: ${PROJECT}-nn
    hostname: nn
    deploy:
      resources:
        limits:
          memory: 1G

  # HDFS DataNode; started after the NameNode container.
  dn:
    image: ${PROJECT}-dn
    depends_on:
      - nn
    deploy:
      resources:
        limits:
          memory: 1G

  # Spark boss, addressable by the fixed hostname "boss".
  spark-boss:
    image: ${PROJECT}-boss
    hostname: boss
    deploy:
      resources:
        limits:
          memory: 0.5G

  # Spark workers: two replicas, each limited to 2G of memory.
  spark-worker:
    image: ${PROJECT}-worker
    deploy:
      replicas: 2
      resources:
        limits:
          memory: 2G
This diff is collapsed.
p5/image.png

119 KiB

# HDFS NameNode image, layered on the shared p5-base image.
FROM p5-base

# The NameNode metadata directory is (re)formatted at container start, so no
# build-time format step is needed: "-format -force" below overwrites any
# existing metadata before the NameNode is launched on hdfs://nn:9000.
# NOTE(review): this means every container start begins with an empty
# filesystem namespace — confirm that is intended.
CMD ["bash", "-c", "hdfs namenode -format -force && hdfs namenode -fs hdfs://nn:9000"]
# Notebook image, layered on the shared p5-base image.
FROM p5-base

# Chain with && so a failed "apt-get update" fails the build instead of
# silently installing from a stale or missing package index.
RUN apt-get update && apt-get install -y unzip

# Serve JupyterLab on all interfaces, port 5000, as root, without opening a
# browser. The token argument is passed through unchanged.
# NOTE(review): in exec form the two single quotes reach Jupyter literally;
# presumably they are stripped and the token ends up empty — confirm.
CMD ["python3", "-m", "jupyterlab", "--no-browser", "--ip=0.0.0.0", "--port=5000", "--allow-root", "--NotebookApp.token=''"]
# Shared base image: Ubuntu 22.04 with Java 11, Python tooling, Hadoop 3.4.1
# and Spark 3.5.5 unpacked at the filesystem root.
FROM ubuntu:22.04

# Install required packages
RUN apt-get update && apt-get install -y \
    wget \
    curl \
    openjdk-11-jdk \
    python3-pip \
    net-tools \
    lsof \
    nano \
    sudo

# Install Python dependencies
RUN pip3 install jupyterlab==4.3.5 pandas==2.2.3 pyspark==3.5.5 matplotlib==3.10.1

# Download and extract Hadoop.
# archive.apache.org keeps every release; dlcdn.apache.org only serves the
# current ones, so a pinned version eventually disappears from it.
RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-3.4.1/hadoop-3.4.1.tar.gz && \
    tar -xf hadoop-3.4.1.tar.gz && \
    rm hadoop-3.4.1.tar.gz

# Download and extract Spark (same mirror reasoning as for Hadoop above)
RUN wget https://archive.apache.org/dist/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz && \
    tar -xf spark-3.5.5-bin-hadoop3.tgz && \
    rm spark-3.5.5-bin-hadoop3.tgz

# Set environment variables
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
ENV HADOOP_HOME=/hadoop-3.4.1
# Only the Hadoop bin directory is added to PATH; Spark's bin is not.
ENV PATH="${PATH}:/hadoop-3.4.1/bin"
# Cassandra 5.0.0 + Spark 3.5.5 image on Ubuntu 22.04 with Java 17.
FROM ubuntu:22.04

# Chain with && so a failed index update aborts the build.
RUN apt-get update && apt-get install -y wget curl openjdk-17-jdk python3-pip iproute2

# Python stuff
# NOTE(review): pyspark is pinned to 3.4.1 while the Spark distribution
# downloaded below is 3.5.5 — confirm the version mismatch is intended.
RUN pip3 install numpy==2.1.3 pyspark==3.4.1 cassandra-driver==3.28.0 grpcio==1.58.0 grpcio-tools==1.58.0

# Install packages in requirements.txt, you can add more packages you need to the requirements.txt file
COPY /src/requirements.txt /requirements.txt
RUN pip3 install -r /requirements.txt

# SPARK
# archive.apache.org keeps every release; dlcdn.apache.org only serves the
# current ones, so a pinned version eventually disappears from it.
RUN wget https://archive.apache.org/dist/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz && \
    tar -xf spark-3.5.5-bin-hadoop3.tgz && \
    rm spark-3.5.5-bin-hadoop3.tgz

# CASSANDRA
RUN wget https://archive.apache.org/dist/cassandra/5.0.0/apache-cassandra-5.0.0-bin.tar.gz && \
    tar -xf apache-cassandra-5.0.0-bin.tar.gz && \
    rm apache-cassandra-5.0.0-bin.tar.gz

ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
# The Spark entry must match the directory the tarball above extracts to
# (spark-3.5.5-bin-hadoop3), otherwise the Spark binaries are not on PATH.
ENV PATH="${PATH}:/apache-cassandra-5.0.0/bin:/spark-3.5.5-bin-hadoop3/bin"

# Container entrypoint script: configures and starts Cassandra.
COPY cassandra.sh /cassandra.sh
CMD ["sh", "/cassandra.sh"]
This diff is collapsed.
# Container entrypoint: rewrite the Cassandra config for this node, start
# Cassandra, then keep the container alive.

# PROJECT comes from the container environment and is the prefix of the seed
# host names. Abort early if it is missing rather than writing a seed list
# like "-db-1,-db-2,-db-3".
: "${PROJECT:?PROJECT must be set (used to build the Cassandra seed host names)}"

conf_file=/apache-cassandra-5.0.0/conf/cassandra.yaml
node_host="$(hostname)"

# Listen and serve client requests on this container's own hostname.
sed -i "s/^listen_address:.*/listen_address: ${node_host}/" "$conf_file"
sed -i "s/^rpc_address:.*/rpc_address: ${node_host}/" "$conf_file"

# Seed nodes are the three db containers of this Compose project.
sed -i "s/- seeds:.*/- seeds: ${PROJECT}-db-1,${PROJECT}-db-2,${PROJECT}-db-3/" "$conf_file"

# NOTE(review): -R presumably permits running Cassandra as root — confirm
# against the Cassandra launcher documentation.
/apache-cassandra-5.0.0/bin/cassandra -R

# Block forever so the container's main process does not exit once the
# launcher command above returns.
sleep infinity
# Compose project for the Cassandra cluster.
# PROJECT must be set in the environment; it names the Compose project and
# the image, and is passed into each container.
name: ${PROJECT}

services:
  # Three replicas of the database image, each limited to 2.5G of memory.
  db:
    image: ${PROJECT}
    hostname: db
    volumes:
      - "./src:/src"
    deploy:
      replicas: 3
      resources:
        limits:
          memory: 2.5G
    environment:
      # Forwarded so code inside the container can read the project name.
      - PROJECT=${PROJECT}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.