{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a0f1c2d3-0000-4000-8000-000000000001",
   "metadata": {},
   "source": [
    "# sf.csv → Parquet conversion on HDFS (CS544)\n",
    "\n",
    "Uploads `sf.csv` to HDFS, reads it with Spark, rewrites it as Parquet with\n",
    "column names normalized (spaces → underscores), then removes the original CSV.\n",
    "Downstream analysis reads the Parquet copy, which is smaller, typed, and faster\n",
    "to load than CSV.\n",
    "\n",
    "Requires a running Spark cluster at `spark://boss:7077` and an HDFS NameNode at `nn:9000`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c8dca847-54af-4284-97d8-0682e88a6e8d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
      "23/10/27 01:41:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.functions import col\n",
    "\n",
    "# Connect to the standalone Spark cluster; the warehouse dir points at HDFS\n",
    "# so Hive-managed tables land there rather than on local disk.\n",
    "spark = (SparkSession.builder.appName(\"cs544\")\n",
    "         .master(\"spark://boss:7077\")\n",
    "         .config(\"spark.executor.memory\", \"2G\")\n",
    "         .config(\"spark.sql.warehouse.dir\", \"hdfs://nn:9000/user/hive/warehouse\")\n",
    "         .enableHiveSupport()\n",
    "         .getOrCreate())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a0f1c2d3-0000-4000-8000-000000000002",
   "metadata": {},
   "source": [
    "## Load raw CSV into HDFS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "2294e4e0-ab19-496c-980f-31df757e7837",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy the local CSV into HDFS so every Spark executor can read it\n",
    "!hdfs dfs -cp sf.csv hdfs://nn:9000/sf.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "cb54bacc-b52a-4c25-93d2-2ba0f61de9b0",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "# NOTE: inferSchema=True triggers an extra full pass over the data to\n",
    "# detect column types; acceptable here since this is a one-time conversion.\n",
    "df = (spark.read.format(\"csv\")\n",
    "      .option(\"header\", True)\n",
    "      .option(\"inferSchema\", True)\n",
    "      .load(\"hdfs://nn:9000/sf.csv\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a0f1c2d3-0000-4000-8000-000000000003",
   "metadata": {},
   "source": [
    "## Convert to Parquet\n",
    "\n",
    "Parquet column names must not contain spaces, so rename every column,\n",
    "replacing spaces with underscores, before writing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c1298818-83f6-444b-b8a0-4be5b16fd6fb",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "23/10/27 01:43:57 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n",
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "cols = [col(c).alias(c.replace(\" \", \"_\")) for c in df.columns]\n",
    "df.select(cols).write.format(\"parquet\").save(\"hdfs://nn:9000/sf.parquet\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a0f1c2d3-0000-4000-8000-000000000004",
   "metadata": {},
   "source": [
    "## Clean up and reload\n",
    "\n",
    "The Parquet copy is now the source of truth; drop the redundant CSV and\n",
    "re-point `df` at the Parquet data for the rest of the analysis."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37d1ded3-ed8a-4e39-94cb-dd3a3272af91",
   "metadata": {},
   "outputs": [],
   "source": [
    "!hdfs dfs -rm hdfs://nn:9000/sf.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abea48b5-e012-4ae2-a53a-e40350f94e20",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = spark.read.format(\"parquet\").load(\"hdfs://nn:9000/sf.parquet\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}