From 9869b90e97d9a34ced6f07fefc0306580d787192 Mon Sep 17 00:00:00 2001 From: TYLER CARAZA-HARTER <tharter@cs544-tharter.cs.wisc.edu> Date: Fri, 21 Mar 2025 15:25:54 -0500 Subject: [PATCH] lec demos --- lec/26-cassandra/src/lec1.ipynb | 401 +++++++++++ lec/26-cassandra/src/lec2.ipynb | 583 ++++++++++++++++ lec/27-cassandra/hash.ipynb | 261 +++++++ lec/27-cassandra/lec.ipynb | 1166 +++++++++++++++++++++++++++++++ p6/Dockerfile.cassandra | 4 +- 5 files changed, 2413 insertions(+), 2 deletions(-) create mode 100644 lec/26-cassandra/src/lec1.ipynb create mode 100644 lec/26-cassandra/src/lec2.ipynb create mode 100644 lec/27-cassandra/hash.ipynb create mode 100644 lec/27-cassandra/lec.ipynb diff --git a/lec/26-cassandra/src/lec1.ipynb b/lec/26-cassandra/src/lec1.ipynb new file mode 100644 index 0000000..7fc65ca --- /dev/null +++ b/lec/26-cassandra/src/lec1.ipynb @@ -0,0 +1,401 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "0ff1c81b-9867-4f98-b227-c89871ff04bf", + "metadata": {}, + "outputs": [], + "source": [ + "from cassandra.cluster import Cluster\n", + "cluster = Cluster([\"p6-db-1\", \"p6-db-2\", \"p6-db-3\"])\n", + "cass = cluster.connect()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3d807c68-ac2a-4fe3-8153-919ea4c0f302", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x76c2245a83d0>" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cass.execute(\"use banking\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "93a16129-d410-4bce-9034-4c792a74188b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x76c21c122500>" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cass.execute(\"\"\"\n", + "CREATE TABLE loans (\n", + " bank_id INT,\n", + " bank_name text 
STATIC,\n", + " loan_id UUID,\n", + " amount int,\n", + " state text,\n", + " PRIMARY KEY ((bank_id), amount, loan_id)\n", + ") WITH CLUSTERING ORDER BY (amount DESC)\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ec0308f4-5132-4e1e-a24e-579e06cc3105", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CREATE TABLE banking.loans (\n", + " bank_id int,\n", + " amount int,\n", + " loan_id uuid,\n", + " bank_name text static,\n", + " state text,\n", + " PRIMARY KEY (bank_id, amount, loan_id)\n", + ") WITH CLUSTERING ORDER BY (amount DESC, loan_id ASC)\n", + " AND additional_write_policy = '99p'\n", + " AND allow_auto_snapshot = true\n", + " AND bloom_filter_fp_chance = 0.01\n", + " AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}\n", + " AND cdc = false\n", + " AND comment = ''\n", + " AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32', 'min_threshold': '4'}\n", + " AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}\n", + " AND memtable = 'default'\n", + " AND crc_check_chance = 1.0\n", + " AND default_time_to_live = 0\n", + " AND extensions = {}\n", + " AND gc_grace_seconds = 864000\n", + " AND incremental_backups = true\n", + " AND max_index_interval = 2048\n", + " AND memtable_flush_period_in_ms = 0\n", + " AND min_index_interval = 128\n", + " AND read_repair = 'BLOCKING'\n", + " AND speculative_retry = '99p';\n" + ] + } + ], + "source": [ + "print(cass.execute(\"DESCRIBE TABLE loans\").one().create_statement)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "7350e0c3-95b7-404e-b6f6-a469eac7169a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x76c21cd73160>" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 
INSERT in Cassandra is an UPSERT: update or insert\n", + "cass.execute(\"\"\"\n", + "INSERT INTO loans (bank_id, bank_name)\n", + "VALUES (544, 'test2')\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "f10be20a-15bb-4092-9a08-272698012813", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>bank_id</th>\n", + " <th>amount</th>\n", + " <th>loan_id</th>\n", + " <th>bank_name</th>\n", + " <th>state</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>544</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>test2</td>\n", + " <td>None</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " bank_id amount loan_id bank_name state\n", + "0 544 None None test2 None" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "pd.DataFrame(cass.execute(\"SELECT * FROM loans\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "be2f1f47-f4be-49b5-90cf-8a9e0d69024a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x76c21c2aceb0>" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cass.execute(\"\"\"\n", + "INSERT INTO loans (bank_id, bank_name, loan_id, amount)\n", + "VALUES (544, 'test2', UUID(), 300)\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": 
"498c868d-41ee-4bc5-bb8d-d8a4e10b4464", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>bank_id</th>\n", + " <th>amount</th>\n", + " <th>loan_id</th>\n", + " <th>bank_name</th>\n", + " <th>state</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>544</td>\n", + " <td>300</td>\n", + " <td>9e912c13-91ad-4d9c-b29c-0ac48d6231fe</td>\n", + " <td>test2</td>\n", + " <td>None</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " bank_id amount loan_id bank_name state\n", + "0 544 300 9e912c13-91ad-4d9c-b29c-0ac48d6231fe test2 None" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(cass.execute(\"SELECT * FROM loans\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "714bedc8-6a48-4aef-b706-cd30c274473b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x76c21d7a3910>" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NOW and UUID generate UUID. 
Supposed to be universally unique.\n", + "# NOW uses MAC addrs and timestamps to make it guaranteed.\n", + "cass.execute(\"\"\"\n", + "INSERT INTO loans (bank_id, bank_name, loan_id, amount, state)\n", + "VALUES (544, 'mybank', NOW(), 400, 'wi')\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "7af36f0e-5db8-424d-9b08-1e8aacc2dacf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>bank_id</th>\n", + " <th>amount</th>\n", + " <th>loan_id</th>\n", + " <th>bank_name</th>\n", + " <th>state</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>544</td>\n", + " <td>400</td>\n", + " <td>05601530-0662-11f0-9e42-b531eb6d9b34</td>\n", + " <td>mybank</td>\n", + " <td>wi</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>544</td>\n", + " <td>300</td>\n", + " <td>9e912c13-91ad-4d9c-b29c-0ac48d6231fe</td>\n", + " <td>mybank</td>\n", + " <td>None</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " bank_id amount loan_id bank_name state\n", + "0 544 400 05601530-0662-11f0-9e42-b531eb6d9b34 mybank wi\n", + "1 544 300 9e912c13-91ad-4d9c-b29c-0ac48d6231fe mybank None" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# above did INSERT of a new row; UPSERT on the partition\n", + "pd.DataFrame(cass.execute(\"SELECT * FROM loans\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": 
"python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lec/26-cassandra/src/lec2.ipynb b/lec/26-cassandra/src/lec2.ipynb new file mode 100644 index 0000000..ea06f4d --- /dev/null +++ b/lec/26-cassandra/src/lec2.ipynb @@ -0,0 +1,583 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "11035b24-6195-412f-af98-50e41ce8b3d0", + "metadata": {}, + "outputs": [], + "source": [ + "from cassandra.cluster import Cluster\n", + "cluster = Cluster([\"p6-db-1\", \"p6-db-2\", \"p6-db-3\"])\n", + "cass = cluster.connect()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "480162ed-d618-4dde-bda6-03249f609a69", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70c6d9b05330>" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cass.execute(\"use banking\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ec630031-d518-4747-ac1f-8ec40aa43251", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70c6d9b06500>" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cass.execute(\"\"\"\n", + "create table loans (\n", + " bank_id int,\n", + " bank_name text STATIC,\n", + " loan_id UUID,\n", + " amount int,\n", + " state text,\n", + " PRIMARY KEY ((bank_id), amount, loan_id)\n", + ") WITH CLUSTERING ORDER BY (amount DESC)\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "286ead09-fbf5-491d-831e-fb0056edd134", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": 
[ + "CREATE TABLE banking.loans (\n", + " bank_id int,\n", + " amount int,\n", + " loan_id uuid,\n", + " bank_name text static,\n", + " state text,\n", + " PRIMARY KEY (bank_id, amount, loan_id)\n", + ") WITH CLUSTERING ORDER BY (amount DESC, loan_id ASC)\n", + " AND additional_write_policy = '99p'\n", + " AND allow_auto_snapshot = true\n", + " AND bloom_filter_fp_chance = 0.01\n", + " AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}\n", + " AND cdc = false\n", + " AND comment = ''\n", + " AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32', 'min_threshold': '4'}\n", + " AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}\n", + " AND memtable = 'default'\n", + " AND crc_check_chance = 1.0\n", + " AND default_time_to_live = 0\n", + " AND extensions = {}\n", + " AND gc_grace_seconds = 864000\n", + " AND incremental_backups = true\n", + " AND max_index_interval = 2048\n", + " AND memtable_flush_period_in_ms = 0\n", + " AND min_index_interval = 128\n", + " AND read_repair = 'BLOCKING'\n", + " AND speculative_retry = '99p';\n" + ] + } + ], + "source": [ + "print(cass.execute(\"describe table loans\").one().create_statement)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "c4f6335b-6340-455e-921c-4b6ac2e1d67b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70c6d17263e0>" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cass.execute(\"\"\"\n", + "INSERT INTO loans (bank_id, bank_name)\n", + "VALUES (544, 'test')\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "55c10edd-126d-4a10-98c8-708f2b337f99", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70c6d9b056c0>" + ] + }, + "execution_count": 21, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "# INSERT is actually UPSERT (insert or update)\n", + "cass.execute(\"\"\"\n", + "INSERT INTO loans (bank_id, bank_name)\n", + "VALUES (544, 'test2')\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "99035291-59eb-4c00-99a1-9737d4e406d9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>bank_id</th>\n", + " <th>amount</th>\n", + " <th>loan_id</th>\n", + " <th>bank_name</th>\n", + " <th>state</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>544</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>test2</td>\n", + " <td>None</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " bank_id amount loan_id bank_name state\n", + "0 544 None None test2 None" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "pd.DataFrame(cass.execute(\"SELECT * FROM loans\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "896b8215-7282-4b73-9ad2-8f0761879253", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70c6d1726830>" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cass.execute(\"\"\"\n", + "INSERT INTO loans (bank_id, bank_name, loan_id, amount)\n", + "VALUES (544, 'test2', UUID(), 300)\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": 25, + "id": "b8e2a46a-6e86-49ef-9d1c-0da26eec0a53", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>bank_id</th>\n", + " <th>amount</th>\n", + " <th>loan_id</th>\n", + " <th>bank_name</th>\n", + " <th>state</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>544</td>\n", + " <td>300</td>\n", + " <td>01976a29-7d80-435f-ba6b-e22abc9d10f3</td>\n", + " <td>test2</td>\n", + " <td>None</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " bank_id amount loan_id bank_name state\n", + "0 544 300 01976a29-7d80-435f-ba6b-e22abc9d10f3 test2 None" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(cass.execute(\"SELECT * FROM loans\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "c87c69d1-4a9e-4065-9a95-8e47a08755c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70c6d3f9eec0>" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NOW and UUID both generate UUIDs.\n", + "# NOW is stronge because it uses MAC addresses and timestamps\n", + "\n", + "# this is both an INSERT (inserting a row) and UPDATE (on the partition)\n", + "cass.execute(\"\"\"\n", + "INSERT INTO loans (bank_id, bank_name, loan_id, amount, state)\n", + "VALUES (544, 'mybank2', NOW(), 350, 'wi')\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": 33, + "id": "032cc612-c753-4bbe-bac3-ca5a8df6cd02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>bank_id</th>\n", + " <th>amount</th>\n", + " <th>loan_id</th>\n", + " <th>bank_name</th>\n", + " <th>state</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>544</td>\n", + " <td>400</td>\n", + " <td>2b644550-0674-11f0-9e42-b531eb6d9b34</td>\n", + " <td>mybank2</td>\n", + " <td>wi</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>544</td>\n", + " <td>350</td>\n", + " <td>5c852cd0-0674-11f0-8b0a-b3bc8dc2bdb9</td>\n", + " <td>mybank2</td>\n", + " <td>wi</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>544</td>\n", + " <td>300</td>\n", + " <td>aebdb7c0-0673-11f0-8b0a-b3bc8dc2bdb9</td>\n", + " <td>mybank2</td>\n", + " <td>wi</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>544</td>\n", + " <td>300</td>\n", + " <td>f8c80870-0673-11f0-8acf-b5f913312dcb</td>\n", + " <td>mybank2</td>\n", + " <td>wi</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>544</td>\n", + " <td>300</td>\n", + " <td>01976a29-7d80-435f-ba6b-e22abc9d10f3</td>\n", + " <td>mybank2</td>\n", + " <td>None</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " bank_id amount loan_id bank_name state\n", + "0 544 400 2b644550-0674-11f0-9e42-b531eb6d9b34 mybank2 wi\n", + "1 544 350 5c852cd0-0674-11f0-8b0a-b3bc8dc2bdb9 mybank2 wi\n", + "2 544 300 aebdb7c0-0673-11f0-8b0a-b3bc8dc2bdb9 mybank2 wi\n", + "3 544 300 
f8c80870-0673-11f0-8acf-b5f913312dcb mybank2 wi\n", + "4 544 300 01976a29-7d80-435f-ba6b-e22abc9d10f3 mybank2 None" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(cass.execute(\"SELECT * FROM loans\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "d8142b23-c6c4-4b77-b5dd-8684c47c05c8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70c6ad556e30>" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cass.execute(\"\"\"\n", + "INSERT INTO loans (bank_id, bank_name, loan_id, amount, state)\n", + "VALUES (999, 'uwcu', NOW(), 500, 'il')\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "b3b1ef23-db1a-43c8-b572-09a48f4aba41", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>bank_id</th>\n", + " <th>amount</th>\n", + " <th>loan_id</th>\n", + " <th>bank_name</th>\n", + " <th>state</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>544</td>\n", + " <td>400</td>\n", + " <td>2b644550-0674-11f0-9e42-b531eb6d9b34</td>\n", + " <td>mybank2</td>\n", + " <td>wi</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>544</td>\n", + " <td>350</td>\n", + " <td>5c852cd0-0674-11f0-8b0a-b3bc8dc2bdb9</td>\n", + " <td>mybank2</td>\n", + " <td>wi</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>544</td>\n", + " <td>300</td>\n", + " 
<td>aebdb7c0-0673-11f0-8b0a-b3bc8dc2bdb9</td>\n", + " <td>mybank2</td>\n", + " <td>wi</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>544</td>\n", + " <td>300</td>\n", + " <td>f8c80870-0673-11f0-8acf-b5f913312dcb</td>\n", + " <td>mybank2</td>\n", + " <td>wi</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>544</td>\n", + " <td>300</td>\n", + " <td>01976a29-7d80-435f-ba6b-e22abc9d10f3</td>\n", + " <td>mybank2</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>999</td>\n", + " <td>500</td>\n", + " <td>78e4a9f0-0674-11f0-8acf-b5f913312dcb</td>\n", + " <td>uwcu</td>\n", + " <td>il</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " bank_id amount loan_id bank_name state\n", + "0 544 400 2b644550-0674-11f0-9e42-b531eb6d9b34 mybank2 wi\n", + "1 544 350 5c852cd0-0674-11f0-8b0a-b3bc8dc2bdb9 mybank2 wi\n", + "2 544 300 aebdb7c0-0673-11f0-8b0a-b3bc8dc2bdb9 mybank2 wi\n", + "3 544 300 f8c80870-0673-11f0-8acf-b5f913312dcb mybank2 wi\n", + "4 544 300 01976a29-7d80-435f-ba6b-e22abc9d10f3 mybank2 None\n", + "5 999 500 78e4a9f0-0674-11f0-8acf-b5f913312dcb uwcu il" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(cass.execute(\"SELECT * FROM loans\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lec/27-cassandra/hash.ipynb b/lec/27-cassandra/hash.ipynb new file mode 100644 index 0000000..45cf799 --- /dev/null +++ b/lec/27-cassandra/hash.ipynb @@ -0,0 +1,261 @@ +{ + "cells": [ + { + "cell_type": "code", 
+ "execution_count": 5, + "id": "19e1af7a-a39d-4382-9e7a-f720a23baba7", + "metadata": {}, + "outputs": [], + "source": [ + "import string\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "da2a5620-5326-4031-a63c-eb1677e9d92f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'ABCDEFGHIJKLMNOPQRSTUVWXYZ'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "string.ascii_uppercase" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e594fb1d-c840-4a33-ad15-bb86b5053037", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>letter</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>A</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>B</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>C</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>D</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>E</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " letter\n", + "0 A\n", + "1 B\n", + "2 C\n", + "3 D\n", + "4 E" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({\"letter\": list(string.ascii_uppercase)})\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3b9d113f-7d73-44ec-9df8-b78c1b64a999", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"3" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hash(\"A\") % 4" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "85d5af92-2a7d-4588-881e-d6ae27d53174", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>letter</th>\n", + " <th>partition-before</th>\n", + " <th>partition-after</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>A</td>\n", + " <td>3</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>B</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>C</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>D</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>E</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " letter partition-before partition-after\n", + "0 A 3 3\n", + "1 B 0 1\n", + "2 C 2 1\n", + "3 D 0 0\n", + "4 E 1 1" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"partition-before\"] = df[\"letter\"].map(lambda letter: hash(letter) % 4) # when we have 4 machines\n", + "df[\"partition-after\"] = df[\"letter\"].map(lambda letter: hash(letter) % 5) # when we have 5 machines\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": 
"f9fb0213-6fa3-4b5d-8e8b-acf13929e5db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.34615384615384615" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 35% of letters stayed on the same machine when we scaled from 4 to 5, the rest moved\n", + "float((df[\"partition-before\"] == df[\"partition-after\"]).astype(int).mean())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lec/27-cassandra/lec.ipynb b/lec/27-cassandra/lec.ipynb new file mode 100644 index 0000000..cafed45 --- /dev/null +++ b/lec/27-cassandra/lec.ipynb @@ -0,0 +1,1166 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "0ff1c81b-9867-4f98-b227-c89871ff04bf", + "metadata": {}, + "outputs": [], + "source": [ + "from cassandra.cluster import Cluster\n", + "cluster = Cluster([\"p6-db-1\", \"p6-db-2\", \"p6-db-3\"])\n", + "cass = cluster.connect()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3d807c68-ac2a-4fe3-8153-919ea4c0f302", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70d240bc3c40>" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cass.execute(\"use banking\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d55396a4-33e6-43c7-842e-d8994c2f5acd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70d240bc0250>" + ] + }, + "execution_count": 3, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "cass.execute(\"drop table if exists loans\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "93a16129-d410-4bce-9034-4c792a74188b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70d238dba320>" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cass.execute(\"\"\"\n", + "CREATE TABLE loans (\n", + " bank_id INT,\n", + " bank_name text STATIC,\n", + " loan_id UUID,\n", + " amount int,\n", + " state text,\n", + " PRIMARY KEY ((bank_id), amount, loan_id)\n", + ") WITH CLUSTERING ORDER BY (amount DESC)\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ec0308f4-5132-4e1e-a24e-579e06cc3105", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CREATE TABLE banking.loans (\n", + " bank_id int,\n", + " amount int,\n", + " loan_id uuid,\n", + " bank_name text static,\n", + " state text,\n", + " PRIMARY KEY (bank_id, amount, loan_id)\n", + ") WITH CLUSTERING ORDER BY (amount DESC, loan_id ASC)\n", + " AND additional_write_policy = '99p'\n", + " AND allow_auto_snapshot = true\n", + " AND bloom_filter_fp_chance = 0.01\n", + " AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}\n", + " AND cdc = false\n", + " AND comment = ''\n", + " AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32', 'min_threshold': '4'}\n", + " AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}\n", + " AND memtable = 'default'\n", + " AND crc_check_chance = 1.0\n", + " AND default_time_to_live = 0\n", + " AND extensions = {}\n", + " AND gc_grace_seconds = 864000\n", + " AND incremental_backups = true\n", + " AND max_index_interval = 2048\n", + " AND memtable_flush_period_in_ms = 0\n", + " AND 
min_index_interval = 128\n", + " AND read_repair = 'BLOCKING'\n", + " AND speculative_retry = '99p';\n" + ] + } + ], + "source": [ + "print(cass.execute(\"DESCRIBE TABLE loans\").one().create_statement)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7350e0c3-95b7-404e-b6f6-a469eac7169a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70d238de1870>" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# INSERT in Cassandra is an UPSERT: update or insert\n", + "cass.execute(\"\"\"\n", + "INSERT INTO loans (bank_id, bank_name)\n", + "VALUES (544, 'test2')\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f10be20a-15bb-4092-9a08-272698012813", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>bank_id</th>\n", + " <th>amount</th>\n", + " <th>loan_id</th>\n", + " <th>bank_name</th>\n", + " <th>state</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>544</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>test2</td>\n", + " <td>None</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " bank_id amount loan_id bank_name state\n", + "0 544 None None test2 None" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "pd.DataFrame(cass.execute(\"SELECT * FROM loans\"))" + ] + }, + { + "cell_type": 
"code", + "execution_count": 8, + "id": "be2f1f47-f4be-49b5-90cf-8a9e0d69024a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70d238de12d0>" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cass.execute(\"\"\"\n", + "INSERT INTO loans (bank_id, bank_name, loan_id, amount)\n", + "VALUES (544, 'test2', UUID(), 300)\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "498c868d-41ee-4bc5-bb8d-d8a4e10b4464", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>bank_id</th>\n", + " <th>amount</th>\n", + " <th>loan_id</th>\n", + " <th>bank_name</th>\n", + " <th>state</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>544</td>\n", + " <td>300</td>\n", + " <td>3daed7cd-7e7a-4107-a11b-35ef10ffc035</td>\n", + " <td>test2</td>\n", + " <td>None</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " bank_id amount loan_id bank_name state\n", + "0 544 300 3daed7cd-7e7a-4107-a11b-35ef10ffc035 test2 None" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(cass.execute(\"SELECT * FROM loans\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "714bedc8-6a48-4aef-b706-cd30c274473b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70d238dba620>" + ] + }, + "execution_count": 
10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NOW and UUID generate UUID. Supposed to be universally unique.\n", + "# NOW uses MAC addrs and timestamps to make it guaranteed.\n", + "cass.execute(\"\"\"\n", + "INSERT INTO loans (bank_id, bank_name, loan_id, amount, state)\n", + "VALUES (544, 'mybank', NOW(), 400, 'wi')\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "7af36f0e-5db8-424d-9b08-1e8aacc2dacf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>bank_id</th>\n", + " <th>amount</th>\n", + " <th>loan_id</th>\n", + " <th>bank_name</th>\n", + " <th>state</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>544</td>\n", + " <td>400</td>\n", + " <td>794087c0-0683-11f0-8b0a-b3bc8dc2bdb9</td>\n", + " <td>mybank</td>\n", + " <td>wi</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>544</td>\n", + " <td>300</td>\n", + " <td>3daed7cd-7e7a-4107-a11b-35ef10ffc035</td>\n", + " <td>mybank</td>\n", + " <td>None</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " bank_id amount loan_id bank_name state\n", + "0 544 400 794087c0-0683-11f0-8b0a-b3bc8dc2bdb9 mybank wi\n", + "1 544 300 3daed7cd-7e7a-4107-a11b-35ef10ffc035 mybank None" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# above did INSERT of a new row; UPSERT on the partition\n", + "pd.DataFrame(cass.execute(\"SELECT * FROM loans\"))" + ] + }, + { + 
"cell_type": "code", + "execution_count": 12, + "id": "33c033d2-3aa4-4b32-8248-7b96117c7099", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70d240bc1210>" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cass.execute(\"\"\"\n", + "INSERT INTO loans (bank_id, bank_name, loan_id, amount, state)\n", + "VALUES (999, 'uwcu', NOW(), 500, 'il')\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c8153e23-2f92-40f8-b9c1-6241c365ea7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>bank_id</th>\n", + " <th>amount</th>\n", + " <th>loan_id</th>\n", + " <th>bank_name</th>\n", + " <th>state</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>544</td>\n", + " <td>400</td>\n", + " <td>794087c0-0683-11f0-8b0a-b3bc8dc2bdb9</td>\n", + " <td>mybank</td>\n", + " <td>wi</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>544</td>\n", + " <td>300</td>\n", + " <td>3daed7cd-7e7a-4107-a11b-35ef10ffc035</td>\n", + " <td>mybank</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>999</td>\n", + " <td>500</td>\n", + " <td>d7faddb0-0683-11f0-9e42-b531eb6d9b34</td>\n", + " <td>uwcu</td>\n", + " <td>il</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " bank_id amount loan_id bank_name state\n", + "0 544 400 794087c0-0683-11f0-8b0a-b3bc8dc2bdb9 mybank wi\n", + "1 544 300 
3daed7cd-7e7a-4107-a11b-35ef10ffc035 mybank None\n", + "2 999 500 d7faddb0-0683-11f0-9e42-b531eb6d9b34 uwcu il" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(cass.execute(\"SELECT * FROM loans\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "25b33372-08ad-4d4a-9f8a-389883360b2d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70d240bc2fe0>" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# TODO: create a new type for names\n", + "cass.execute(\"\"\"\n", + "CREATE TYPE FullName (\n", + " first text,\n", + " last text\n", + ")\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "c6b42f4a-eb53-477d-aff2-3bd72845749c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70d207260d30>" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cass.execute(\"\"\"\n", + "ALTER TABLE loans ADD (name FullName);\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "af6c7d02-5185-44ae-a0f2-c743feae1927", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>bank_id</th>\n", + " <th>amount</th>\n", + " <th>loan_id</th>\n", + " <th>bank_name</th>\n", + " <th>name</th>\n", + " <th>state</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " 
<th>0</th>\n", + " <td>544</td>\n", + " <td>400</td>\n", + " <td>794087c0-0683-11f0-8b0a-b3bc8dc2bdb9</td>\n", + " <td>mybank</td>\n", + " <td>None</td>\n", + " <td>wi</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>544</td>\n", + " <td>300</td>\n", + " <td>3daed7cd-7e7a-4107-a11b-35ef10ffc035</td>\n", + " <td>mybank</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>999</td>\n", + " <td>500</td>\n", + " <td>d7faddb0-0683-11f0-9e42-b531eb6d9b34</td>\n", + " <td>uwcu</td>\n", + " <td>None</td>\n", + " <td>il</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " bank_id amount loan_id bank_name name state\n", + "0 544 400 794087c0-0683-11f0-8b0a-b3bc8dc2bdb9 mybank None wi\n", + "1 544 300 3daed7cd-7e7a-4107-a11b-35ef10ffc035 mybank None None\n", + "2 999 500 d7faddb0-0683-11f0-9e42-b531eb6d9b34 uwcu None il" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Cassandra has sparse tables, so adding these nulls doesn't require modifying a bunch of rows\n", + "pd.DataFrame(cass.execute(\"SELECT * FROM loans\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "d0485f70-3919-437c-9e56-b22569c61cb0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70d2072cd780>" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cass.execute(\"\"\"\n", + "INSERT INTO loans (bank_id, bank_name, loan_id, amount, state, name)\n", + "VALUES (999, 'uwcu', NOW(), 500, 'il', {first: 'Tyler', last:'C'})\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "ced5a9f4-2e8e-4956-9d08-a632bc4b0960", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + "
vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>bank_id</th>\n", + " <th>amount</th>\n", + " <th>loan_id</th>\n", + " <th>bank_name</th>\n", + " <th>name</th>\n", + " <th>state</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>544</td>\n", + " <td>400</td>\n", + " <td>794087c0-0683-11f0-8b0a-b3bc8dc2bdb9</td>\n", + " <td>mybank</td>\n", + " <td>None</td>\n", + " <td>wi</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>544</td>\n", + " <td>300</td>\n", + " <td>3daed7cd-7e7a-4107-a11b-35ef10ffc035</td>\n", + " <td>mybank</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>999</td>\n", + " <td>500</td>\n", + " <td>d7faddb0-0683-11f0-9e42-b531eb6d9b34</td>\n", + " <td>uwcu</td>\n", + " <td>None</td>\n", + " <td>il</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>999</td>\n", + " <td>500</td>\n", + " <td>5866b0a0-0684-11f0-8acf-b5f913312dcb</td>\n", + " <td>uwcu</td>\n", + " <td>(Tyler, C)</td>\n", + " <td>il</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " bank_id amount loan_id bank_name \\\n", + "0 544 400 794087c0-0683-11f0-8b0a-b3bc8dc2bdb9 mybank \n", + "1 544 300 3daed7cd-7e7a-4107-a11b-35ef10ffc035 mybank \n", + "2 999 500 d7faddb0-0683-11f0-9e42-b531eb6d9b34 uwcu \n", + "3 999 500 5866b0a0-0684-11f0-8acf-b5f913312dcb uwcu \n", + "\n", + " name state \n", + "0 None wi \n", + "1 None None \n", + "2 None il \n", + "3 (Tyler, C) il " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(cass.execute(\"SELECT * FROM loans\"))" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 23, + "id": "dd3051d3-efe0-4d15-abf8-2b8db5c429ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>name_first</th>\n", + " <th>name_last</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Tyler</td>\n", + " <td>C</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " name_first name_last\n", + "0 None None\n", + "1 None None\n", + "2 None None\n", + "3 Tyler C" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(cass.execute(\"SELECT name.first, name.last FROM loans\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "2bfc163c-994e-43f4-bbe3-291c0324e040", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " 
<th>f</th>\n", + " <th>l</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Tyler</td>\n", + " <td>C</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " f l\n", + "0 None None\n", + "1 Tyler C" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(cass.execute(\"SELECT name.first AS f, name.last as l FROM loans WHERE bank_id = 999\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "8fe52763-0c3d-40a3-98f3-9435e2401628", + "metadata": {}, + "outputs": [], + "source": [ + "# prepared statement" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "ee49b471-f7bb-4bf0-86d6-211c5f314fa2", + "metadata": {}, + "outputs": [], + "source": [ + "inst_stmt = cass.prepare(\"\"\"\n", + "INSERT INTO loans (bank_id, loan_id, amount, name)\n", + "VALUES (999, NOW(), ?, {first: ?, last: ?})\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "4a3f64c0-33cc-4b89-b170-3bb875ceaeaa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<cassandra.cluster.ResultSet at 0x70d207261390>" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cass.execute(inst_stmt, (345, 'Tyler', 'C'))" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "027c3b62-4f17-4358-b5d9-ae22eed54a91", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " 
<thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>bank_id</th>\n", + " <th>amount</th>\n", + " <th>loan_id</th>\n", + " <th>bank_name</th>\n", + " <th>name</th>\n", + " <th>state</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>544</td>\n", + " <td>400</td>\n", + " <td>794087c0-0683-11f0-8b0a-b3bc8dc2bdb9</td>\n", + " <td>mybank</td>\n", + " <td>None</td>\n", + " <td>wi</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>544</td>\n", + " <td>300</td>\n", + " <td>3daed7cd-7e7a-4107-a11b-35ef10ffc035</td>\n", + " <td>mybank</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>999</td>\n", + " <td>500</td>\n", + " <td>d7faddb0-0683-11f0-9e42-b531eb6d9b34</td>\n", + " <td>uwcu</td>\n", + " <td>None</td>\n", + " <td>il</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>999</td>\n", + " <td>500</td>\n", + " <td>5866b0a0-0684-11f0-8acf-b5f913312dcb</td>\n", + " <td>uwcu</td>\n", + " <td>(Tyler, C)</td>\n", + " <td>il</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>999</td>\n", + " <td>345</td>\n", + " <td>857de4d0-0686-11f0-8acf-b5f913312dcb</td>\n", + " <td>uwcu</td>\n", + " <td>(Tyler, C)</td>\n", + " <td>None</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " bank_id amount loan_id bank_name \\\n", + "0 544 400 794087c0-0683-11f0-8b0a-b3bc8dc2bdb9 mybank \n", + "1 544 300 3daed7cd-7e7a-4107-a11b-35ef10ffc035 mybank \n", + "2 999 500 d7faddb0-0683-11f0-9e42-b531eb6d9b34 uwcu \n", + "3 999 500 5866b0a0-0684-11f0-8acf-b5f913312dcb uwcu \n", + "4 999 345 857de4d0-0686-11f0-8acf-b5f913312dcb uwcu \n", + "\n", + " name state \n", + "0 None wi \n", + "1 None None \n", + "2 None il \n", + "3 (Tyler, C) il \n", + "4 (Tyler, C) None " + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"pd.DataFrame(cass.execute(\"SELECT * FROM loans\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "30d95055-3815-4411-bf0f-1138dcde89aa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>bank_name</th>\n", + " <th>system_avg_amount</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>mybank</td>\n", + " <td>350</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>uwcu</td>\n", + " <td>448</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " bank_name system_avg_amount\n", + "0 mybank 350\n", + "1 uwcu 448" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(cass.execute(\"\"\"\n", + "SELECT bank_name, AVG(amount)\n", + "FROM loans\n", + "GROUP BY bank_id\n", + "\"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "ea717e2a-1534-4866-a4ba-a10663af5cc5", + "metadata": {}, + "outputs": [ + { + "ename": "InvalidRequest", + "evalue": "Error from server: code=2200 [Invalid query] message=\"Group by is currently only supported on the columns of the PRIMARY KEY, got state\"", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mInvalidRequest\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[73], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m 
pd\u001b[38;5;241m.\u001b[39mDataFrame(\u001b[43mcass\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\"\"\u001b[39;49m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124;43mSELECT state, AVG(amount)\u001b[39;49m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124;43mFROM loans\u001b[39;49m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;124;43mGROUP BY state\u001b[39;49m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;124;43m\"\"\"\u001b[39;49m\u001b[43m)\u001b[49m)\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/cassandra/cluster.py:2637\u001b[0m, in \u001b[0;36mcassandra.cluster.Session.execute\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/cassandra/cluster.py:4920\u001b[0m, in \u001b[0;36mcassandra.cluster.ResponseFuture.result\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mInvalidRequest\u001b[0m: Error from server: code=2200 [Invalid query] message=\"Group by is currently only supported on the columns of the PRIMARY KEY, got state\"" + ] + } + ], + "source": [ + "# data is partitioned by partition key.\n", + "# grouping on something else would trigger a shuffle, which Cassandra doesn't support!\n", + "# TODO: write ETL job to get this into HDFS/Parquet+Spark\n", + "pd.DataFrame(cass.execute(\"\"\"\n", + "SELECT state, AVG(amount)\n", + "FROM loans\n", + "GROUP BY state\n", + "\"\"\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/p6/Dockerfile.cassandra b/p6/Dockerfile.cassandra index a92d1a0..9dc822f 100644 --- a/p6/Dockerfile.cassandra +++ b/p6/Dockerfile.cassandra @@ -1,5 +1,5 @@ 
-FROM ubuntu:24.04 -RUN apt-get update; apt-get install -y wget curl openjdk-17-jdk python3-pip net-tools lsof vim unzip +FROM ubuntu:22.04 +RUN apt-get update; apt-get install -y wget curl openjdk-17-jdk python3-pip iproute2 # Python stuff RUN pip3 install numpy==2.1.3 pyspark==3.4.1 cassandra-driver==3.28.0 grpcio==1.58.0 grpcio-tools==1.58.0 -- GitLab