From f6f2843ccfde4ced815a92bb7577dac106cc4f2c Mon Sep 17 00:00:00 2001 From: TYLER CARAZA-HARTER <tharter@cs544-tharter.cs.wisc.edu> Date: Fri, 28 Feb 2025 12:36:33 -0600 Subject: [PATCH] lec demos --- lec/15-sql/lec1.ipynb | 1287 +++++++++++++++++++++++++++++++++++ lec/15-sql/lec2.ipynb | 1505 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 2792 insertions(+) create mode 100644 lec/15-sql/lec1.ipynb create mode 100644 lec/15-sql/lec2.ipynb diff --git a/lec/15-sql/lec1.ipynb b/lec/15-sql/lec1.ipynb new file mode 100644 index 0000000..6c084b4 --- /dev/null +++ b/lec/15-sql/lec1.ipynb @@ -0,0 +1,1287 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "df568295-31af-4fde-b402-8adecdf57f13", + "metadata": {}, + "outputs": [], + "source": [ + "from sqlalchemy import create_engine, text\n", + "engine = create_engine(\"mysql+mysqlconnector://root:abc@127.0.0.1:3306/cs544\")\n", + "conn = engine.connect()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8cabbb35-3d75-44ee-886c-6aa871a01d68", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(conn.execute(text(\"\"\"\n", + " show tables\n", + "\"\"\")))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b68ba375-c9c9-4677-9d9d-310d1927a276", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<sqlalchemy.engine.cursor.CursorResult at 0x765a7c3ddb00>" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# table: users\n", + "# columns: id, name, phone\n", + "# name is required\n", + "# id uniquely identifies row\n", + "conn.execute(text(\"\"\"\n", + " create table users (\n", + " id int,\n", + " name text NOT NULL,\n", + " phone text,\n", + " PRIMARY KEY (id)\n", + " )\n", + "\"\"\"))" + ] + }, + { + "cell_type": "code", + 
"execution_count": 7, + "id": "00a5b5b9-8e91-4d90-99ad-e85a1756ea88", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<sqlalchemy.engine.cursor.CursorResult at 0x765a714c6820>" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute(text(\"\"\"\n", + " create table accounts (\n", + " user_id int,\n", + " name text NOT NULL,\n", + " amount int NOT NULL,\n", + " FOREIGN KEY (user_id) references users(id)\n", + " )\n", + "\"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9739d58c-8004-4844-a609-7ed95bdbf9aa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('accounts',), ('users',)]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(conn.execute(text(\"\"\"\n", + " show tables\n", + "\"\"\")))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1fc2171c-9f09-4b40-b5e0-fcb84870ec7d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<sqlalchemy.engine.cursor.CursorResult at 0x765a714c6f90>" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute(text(\"\"\"\n", + " INSERT INTO users (id, name) VALUES (1, \"tyler\")\n", + "\"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "a96a9978-0927-4886-bd72-225150f9a5a2", + "metadata": {}, + "outputs": [], + "source": [ + "# conn.execute(text(\"\"\"\n", + "# INSERT INTO users (id, name) VALUES (1, \"tyler\")\n", + "# \"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0dbc816b-4f66-4b19-bcb6-e1b5212ef469", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(1, 'tyler', None)]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(conn.execute(text(\"\"\"\n", + " SELECT *\n", + " FROM 
users\n", + "\"\"\")))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "45c24702-0285-43fe-ae0e-b9d01adc2a37", + "metadata": {}, + "outputs": [], + "source": [ + "conn.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "2a18a93d-20c1-4f02-875a-e6154ad7aef5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<sqlalchemy.engine.cursor.CursorResult at 0x765a842ece50>" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute(text(\"\"\"\n", + " INSERT INTO accounts (user_id, name, amount)\n", + " VALUES (1, \"A\", 10), (1, \"B\", 20)\n", + "\"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "753e807c-51d8-4190-9e8b-92e95bc8030b", + "metadata": {}, + "outputs": [], + "source": [ + "conn.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "ca9c9379-ba53-42f0-9e11-eed9921adc01", + "metadata": {}, + "outputs": [], + "source": [ + "# this would break an invariant, so it's not allowed!\n", + "# foreign keys are still referencing user id 1\n", + "#\n", + "# conn.execute(text(\"\"\"\n", + "# DELETE FROM users WHERE id = 1;\n", + "# \"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "75ae1cff-684a-4c04-9ce3-f619edea898c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "de914434-5eb7-465b-aec4-8bec6953b623", + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://raw.githubusercontent.com/cfpb/api/master/resources/datasets/hmda/code_sheets/\"\n", + "df = pd.read_csv(url + \"action_taken.csv\")\n", + "df.to_sql(\"actions\", conn, index=False, if_exists=\"replace\")\n", + "df = pd.read_csv(url + \"loan_type.csv\")\n", + "df.to_sql(\"loan_types\", conn, index=False, if_exists=\"replace\")\n", + "df = pd.read_csv(url + \"loan_purpose.csv\")\n", + 
"df.to_sql(\"purposes\", conn, index=False, if_exists=\"replace\")\n", + "conn.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "4c24c47e-8f03-4b84-9647-6fd1559a4b0b", + "metadata": {}, + "outputs": [], + "source": [ + "import pyarrow as pa\n", + "import pyarrow.csv, pyarrow.parquet\n", + "\n", + "t = pa.parquet.read_table(\n", + " \"loans.parquet\", \n", + " columns=[\"lei\", \"action_taken\", \"loan_type\",\n", + " \"loan_amount\", \"interest_rate\", \"loan_purpose\", \"income\"\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "a0f419e7-b0d7-4bf1-94ad-4c4ca961ada6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "447367" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t.to_pandas().to_sql(\"loans\", conn, index=False,\n", + " if_exists=\"replace\", chunksize=10_000)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "96846d26-3aa8-414a-a925-eafa9fe60f50", + "metadata": {}, + "outputs": [], + "source": [ + "conn.commit()" + ] + }, + { + "cell_type": "markdown", + "id": "24e3be3e-5296-44fa-a850-c1e0df34cd38", + "metadata": {}, + "source": [ + "# Transactions" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "791ad3aa-0a41-4c5c-b1ee-6d9b5b95e023", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "remaining: -2\n", + "rollback!\n" + ] + } + ], + "source": [ + "conn.execute(text(\"\"\"\n", + "update accounts set amount = amount + 5 where name = 'B'\n", + "\"\"\"))\n", + "conn.execute(text(\"\"\"\n", + "update accounts set amount = amount - 5 where name = 'A'\n", + "\"\"\"))\n", + "\n", + "# invariant: account cannot go negative\n", + "remaining_amount = list(conn.execute(text(\n", + " \"select amount from accounts where name = 'A'\"\n", + ")))[0][0]\n", + "print(\"remaining:\", remaining_amount)\n", + "if remaining_amount >= 
0:\n", + " print(\"commit!\")\n", + " conn.commit()\n", + "else:\n", + " print(\"rollback!\")\n", + " conn.rollback()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10f7d983-69fc-4800-b529-b7f19e0cdb73", + "metadata": {}, + "outputs": [], + "source": [ + "# conn.rollback() or conn.commit()" + ] + }, + { + "cell_type": "markdown", + "id": "49392cee-e7b9-4ce0-9ce3-961965443b3d", + "metadata": {}, + "source": [ + "# Analyze/Query the Data" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "ed5f92fc-ac76-4307-bd0c-6b714ed5a699", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>id</th>\n", + " <th>action_taken</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>Loan originated</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>2</td>\n", + " <td>Application approved but not accepted</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>3</td>\n", + " <td>Application denied by financial institution</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>4</td>\n", + " <td>Application withdrawn by applicant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>5</td>\n", + " <td>File closed for incompleteness</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>6</td>\n", + " <td>Loan purchased by the institution</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>7</td>\n", + " <td>Preapproval request denied by financial instit...</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>7</th>\n", + " <td>8</td>\n", + " <td>Preapproval request approved but not accepted</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " id action_taken\n", + "0 1 Loan originated\n", + "1 2 Application approved but not accepted\n", + "2 3 Application denied by financial institution\n", + "3 4 Application withdrawn by applicant\n", + "4 5 File closed for incompleteness\n", + "5 6 Loan purchased by the institution\n", + "6 7 Preapproval request denied by financial instit...\n", + "7 8 Preapproval request approved but not accepted" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what are all the possible actions? Practice SELECT/FROM.\n", + "pd.read_sql(\"\"\"\n", + "SELECT *\n", + "FROM actions\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "8349be60-02f4-43bb-9cf2-568bc70d2c75", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>lei</th>\n", + " <th>action_taken</th>\n", + " <th>loan_type</th>\n", + " <th>loan_amount</th>\n", + " <th>interest_rate</th>\n", + " <th>loan_purpose</th>\n", + " <th>income</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>6</td>\n", + " <td>1</td>\n", + " <td>305000.0</td>\n", + " <td>3.875</td>\n", + " <td>1</td>\n", + " <td>108.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>4</td>\n", + " <td>1</td>\n", + " 
<td>65000.0</td>\n", + " <td>NA</td>\n", + " <td>1</td>\n", + " <td>103.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>6</td>\n", + " <td>1</td>\n", + " <td>75000.0</td>\n", + " <td>3.25</td>\n", + " <td>1</td>\n", + " <td>146.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>155000.0</td>\n", + " <td>4.0</td>\n", + " <td>32</td>\n", + " <td>70.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>305000.0</td>\n", + " <td>3.25</td>\n", + " <td>1</td>\n", + " <td>71.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>175000.0</td>\n", + " <td>3.375</td>\n", + " <td>1</td>\n", + " <td>117.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>575000.0</td>\n", + " <td>4.5</td>\n", + " <td>1</td>\n", + " <td>180.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>105000.0</td>\n", + " <td>5.375</td>\n", + " <td>1</td>\n", + " <td>180.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>85000.0</td>\n", + " <td>3.375</td>\n", + " <td>1</td>\n", + " <td>136.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>549300FQ2SN6TRRGB032</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>405000.0</td>\n", + " <td>Exempt</td>\n", + " <td>1</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " lei action_taken loan_type loan_amount interest_rate \\\n", + "0 54930034MNPILHP25H80 6 1 305000.0 3.875 \n", + "1 54930034MNPILHP25H80 4 
1 65000.0 NA \n", + "2 54930034MNPILHP25H80 6 1 75000.0 3.25 \n", + "3 54930034MNPILHP25H80 1 1 155000.0 4.0 \n", + "4 54930034MNPILHP25H80 1 1 305000.0 3.25 \n", + "5 54930034MNPILHP25H80 1 1 175000.0 3.375 \n", + "6 54930034MNPILHP25H80 1 1 575000.0 4.5 \n", + "7 54930034MNPILHP25H80 1 1 105000.0 5.375 \n", + "8 54930034MNPILHP25H80 1 1 85000.0 3.375 \n", + "9 549300FQ2SN6TRRGB032 1 1 405000.0 Exempt \n", + "\n", + " loan_purpose income \n", + "0 1 108.0 \n", + "1 1 103.0 \n", + "2 1 146.0 \n", + "3 32 70.0 \n", + "4 1 71.0 \n", + "5 1 117.0 \n", + "6 1 180.0 \n", + "7 1 180.0 \n", + "8 1 136.0 \n", + "9 1 NaN " + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what are the first 10 loans? Practice LIMIT.\n", + "pd.read_sql(\"\"\"\n", + "SELECT *\n", + "FROM loans\n", + "LIMIT 10\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f99ed83c-d50c-43f7-bf3c-7307bb30801b", + "metadata": {}, + "outputs": [], + "source": [ + "# projection: choosing what columns (SELECT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2fe1388-b308-4f8c-8a63-c426c0f1b787", + "metadata": {}, + "outputs": [], + "source": [ + "# selection: filtering rows (WHERE)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "0da82d35-5e7f-481b-a3fe-9d828b541794", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>interest_rate</th>\n", + " <th>loan_thousands</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " 
<tr>\n", + " <th>0</th>\n", + " <td>3.875</td>\n", + " <td>305.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>NA</td>\n", + " <td>65.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>3.25</td>\n", + " <td>75.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>4.0</td>\n", + " <td>155.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>3.25</td>\n", + " <td>305.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>3.375</td>\n", + " <td>175.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>4.5</td>\n", + " <td>575.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>5.375</td>\n", + " <td>105.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>3.375</td>\n", + " <td>85.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>Exempt</td>\n", + " <td>405.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " interest_rate loan_thousands\n", + "0 3.875 305.0\n", + "1 NA 65.0\n", + "2 3.25 75.0\n", + "3 4.0 155.0\n", + "4 3.25 305.0\n", + "5 3.375 175.0\n", + "6 4.5 575.0\n", + "7 5.375 105.0\n", + "8 3.375 85.0\n", + "9 Exempt 405.0" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what are the first 10 interest rates and loan amounts (in thousands)? 
Practice SELECT.\n", + "pd.read_sql(\"\"\"\n", + "SELECT interest_rate, loan_amount / 1000 AS loan_thousands\n", + "FROM loans\n", + "LIMIT 10\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "4ee4b0cb-7671-4570-9e18-fcd4b8c0394c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>lei</th>\n", + " <th>action_taken</th>\n", + " <th>loan_type</th>\n", + " <th>loan_amount</th>\n", + " <th>interest_rate</th>\n", + " <th>loan_purpose</th>\n", + " <th>income</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>254900IER2H3R8YLBW04</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>105000.0</td>\n", + " <td>2.875</td>\n", + " <td>31</td>\n", + " <td>1530000.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>3Y4U8VZURTYWI1W2K376</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>7455000.0</td>\n", + " <td>NA</td>\n", + " <td>4</td>\n", + " <td>94657029.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>549300CS1XP28EERR469</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>75000.0</td>\n", + " <td>4.99</td>\n", + " <td>4</td>\n", + " <td>2030000.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>549300CS1XP28EERR469</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>205000.0</td>\n", + " <td>3.75</td>\n", + " <td>1</td>\n", + " <td>7291000.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " lei action_taken loan_type loan_amount interest_rate \\\n", + "0 
254900IER2H3R8YLBW04 1 1 105000.0 2.875 \n", + "1 3Y4U8VZURTYWI1W2K376 3 1 7455000.0 NA \n", + "2 549300CS1XP28EERR469 1 1 75000.0 4.99 \n", + "3 549300CS1XP28EERR469 1 1 205000.0 3.75 \n", + "\n", + " loan_purpose income \n", + "0 31 1530000.0 \n", + "1 4 94657029.0 \n", + "2 4 2030000.0 \n", + "3 1 7291000.0 " + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what are the loans for individuals with income over $1 million? Practice WHERE.\n", + "pd.read_sql(\"\"\"\n", + "SELECT *\n", + "FROM loans\n", + "WHERE income > 1000000\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "d64a2a5e-e2d4-42a3-a894-9283160d2636", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>lei</th>\n", + " <th>action_taken</th>\n", + " <th>loan_type</th>\n", + " <th>loan_amount</th>\n", + " <th>interest_rate</th>\n", + " <th>loan_purpose</th>\n", + " <th>income</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>549300XWUSRVVOHPRY47</td>\n", + " <td>6</td>\n", + " <td>1</td>\n", + " <td>264185000.0</td>\n", + " <td>NA</td>\n", + " <td>1</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>AD6GFRVSDT01YPT1CS68</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>74755000.0</td>\n", + " <td>1.454</td>\n", + " <td>1</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>AD6GFRVSDT01YPT1CS68</td>\n", + " <td>4</td>\n", + " <td>2</td>\n", + " 
<td>66005000.0</td>\n", + " <td>NA</td>\n", + " <td>1</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>YQI2CPR3Z44KAR0HG822</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>65005000.0</td>\n", + " <td>3.0</td>\n", + " <td>1</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>254900YA1AQXNM8QVZ06</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>63735000.0</td>\n", + " <td>2.99</td>\n", + " <td>2</td>\n", + " <td>None</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " lei action_taken loan_type loan_amount interest_rate \\\n", + "0 549300XWUSRVVOHPRY47 6 1 264185000.0 NA \n", + "1 AD6GFRVSDT01YPT1CS68 1 1 74755000.0 1.454 \n", + "2 AD6GFRVSDT01YPT1CS68 4 2 66005000.0 NA \n", + "3 YQI2CPR3Z44KAR0HG822 1 1 65005000.0 3.0 \n", + "4 254900YA1AQXNM8QVZ06 1 2 63735000.0 2.99 \n", + "\n", + " loan_purpose income \n", + "0 1 None \n", + "1 1 None \n", + "2 1 None \n", + "3 1 None \n", + "4 2 None " + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what are the five biggest loans in terms of dollar amount? 
Practice ORDER BY.\n", + "pd.read_sql(\"\"\"\n", + "SELECT *\n", + "FROM loans\n", + "ORDER BY loan_amount DESC\n", + "LIMIT 5\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "c01b796c-ded3-4e11-81e7-69d7660da9cb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>action_taken</th>\n", + " <th>loan_type</th>\n", + " <th>lei</th>\n", + " <th>loan_amount</th>\n", + " <th>interest_rate</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Loan purchased by the institution</td>\n", + " <td>Conventional</td>\n", + " <td>549300XWUSRVVOHPRY47</td>\n", + " <td>264185000.0</td>\n", + " <td>NA</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Loan originated</td>\n", + " <td>Conventional</td>\n", + " <td>AD6GFRVSDT01YPT1CS68</td>\n", + " <td>74755000.0</td>\n", + " <td>1.454</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Application withdrawn by applicant</td>\n", + " <td>FHA-insured</td>\n", + " <td>AD6GFRVSDT01YPT1CS68</td>\n", + " <td>66005000.0</td>\n", + " <td>NA</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Loan originated</td>\n", + " <td>Conventional</td>\n", + " <td>YQI2CPR3Z44KAR0HG822</td>\n", + " <td>65005000.0</td>\n", + " <td>3.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Loan originated</td>\n", + " <td>FHA-insured</td>\n", + " <td>254900YA1AQXNM8QVZ06</td>\n", + " <td>63735000.0</td>\n", + " <td>2.99</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + 
"text/plain": [ + " action_taken loan_type lei \\\n", + "0 Loan purchased by the institution Conventional 549300XWUSRVVOHPRY47 \n", + "1 Loan originated Conventional AD6GFRVSDT01YPT1CS68 \n", + "2 Application withdrawn by applicant FHA-insured AD6GFRVSDT01YPT1CS68 \n", + "3 Loan originated Conventional YQI2CPR3Z44KAR0HG822 \n", + "4 Loan originated FHA-insured 254900YA1AQXNM8QVZ06 \n", + "\n", + " loan_amount interest_rate \n", + "0 264185000.0 NA \n", + "1 74755000.0 1.454 \n", + "2 66005000.0 NA \n", + "3 65005000.0 3.0 \n", + "4 63735000.0 2.99 " + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what are the actions taken and types for those loans (show the text, not numbers)? Practice INNER JOIN.\n", + "pd.read_sql(\"\"\"\n", + "SELECT actions.action_taken, loan_types.loan_type, loans.lei, loans.loan_amount, loans.interest_rate\n", + "FROM loans\n", + "INNER JOIN actions ON loans.action_taken = actions.id\n", + "INNER JOIN loan_types ON loans.loan_type = loan_types.id\n", + "ORDER BY loan_amount DESC\n", + "LIMIT 5\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "b7d4a687-70bf-46e7-a715-013977358d05", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>lei</th>\n", + " <th>action_taken</th>\n", + " <th>loan_type</th>\n", + " <th>loan_amount</th>\n", + " <th>interest_rate</th>\n", + " <th>loan_purpose</th>\n", + " <th>income</th>\n", + " <th>id</th>\n", + " <th>loan_purpose</th>\n", + " </tr>\n", + " </thead>\n", + " 
<tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>3</td>\n", + " <td>Refinancing</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " lei action_taken loan_type loan_amount interest_rate loan_purpose income \\\n", + "0 None None None None None None None \n", + "\n", + " id loan_purpose \n", + "0 3 Refinancing " + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what is a loan_purpose that doesn't appear in the loans table? Practice LEFT/RIGHT JOIN.\n", + "pd.read_sql(\"\"\"\n", + "SELECT *\n", + "FROM loans\n", + "RIGHT JOIN purposes ON loans.loan_purpose = purposes.id\n", + "WHERE loans.loan_purpose IS NULL\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc73517c-cf57-4a91-bb9d-2fbfc76544d5", + "metadata": {}, + "outputs": [], + "source": [ + "# how many rows are in the table? Practice COUNT(*)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e91feeee-8689-4f57-a991-f6ca3fee2a6d", + "metadata": {}, + "outputs": [], + "source": [ + "# how many non-null values are in the income column? Practice COUNT(column)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12333532-0618-4ed4-b71b-260a4f35e581", + "metadata": {}, + "outputs": [], + "source": [ + "# what is the average interest rate for loans of type \"Conventional\"? Practice AVG." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23c00af3-e385-435a-8bf6-f5cfe64f02db", + "metadata": {}, + "outputs": [], + "source": [ + "# how many loans are there of each type? Practice GROUP BY." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2400a6a4-7056-47e0-b202-3bbb85a77b2f", + "metadata": {}, + "outputs": [], + "source": [ + "# which loan types appear at least 10,000 times? Practice HAVING." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lec/15-sql/lec2.ipynb b/lec/15-sql/lec2.ipynb new file mode 100644 index 0000000..6fa2118 --- /dev/null +++ b/lec/15-sql/lec2.ipynb @@ -0,0 +1,1505 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "34c3d038-ece1-448a-8613-9f950fd70fb2", + "metadata": {}, + "outputs": [], + "source": [ + "from sqlalchemy import create_engine, text\n", + "engine = create_engine(\"mysql+mysqlconnector://root:abc@127.0.0.1:3306/cs544\")\n", + "conn = engine.connect()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d8c59b65-c21d-4853-b54a-77364d8009ad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(conn.execute(text(\"show tables\")))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "cad5f150-10c2-4d87-a45e-ac21622862db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<sqlalchemy.engine.cursor.CursorResult at 0x7696940966d0>" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# table: users\n", + "# columns: id, name, phone\n", + "# name is required\n", + "# id uniquely identifies row\n", + "conn.execute(text(\"\"\"\n", + " 
create table users (\n", + " id int,\n", + " name text NOT NULL,\n", + " phone text,\n", + " PRIMARY KEY (id)\n", + " )\n", + "\"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7c53fd18-7db7-419a-a884-f5a73de25b6f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<sqlalchemy.engine.cursor.CursorResult at 0x769694096cf0>" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute(text(\"\"\"\n", + " create table accounts (\n", + " user_id int,\n", + " name text NOT NULL,\n", + " amount int NOT NULL,\n", + " FOREIGN KEY (user_id) REFERENCES users (id)\n", + " )\n", + "\"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7f694879-f48c-4336-89fb-03c2b423cdf2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('accounts',), ('users',)]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(conn.execute(text(\"show tables\")))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "4a0f1e2c-0e17-41bb-ab9c-49c93351f18e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<sqlalchemy.engine.cursor.CursorResult at 0x769694096970>" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute(text(\"\"\"\n", + "INSERT INTO users (id, name) VALUES (1, \"tyler\")\n", + "\"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a2e34483-f7f0-4659-91db-d587f08de40b", + "metadata": {}, + "outputs": [], + "source": [ + "# conn.execute(text(\"\"\"\n", + "# INSERT INTO users (id, name) VALUES (1, \"tyler\")\n", + "# \"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "88ff233c-ff1b-4d4c-8060-28b216898734", + "metadata": {}, + "outputs": [], + "source": [ + "conn.commit()" + ] + }, + { + "cell_type": "code", + 
"execution_count": 14, + "id": "7b93f8cb-bc35-49bd-be82-7c48af5d2f56", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<sqlalchemy.engine.cursor.CursorResult at 0x76969c5c8e50>" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conn.execute(text(\"\"\"\n", + "INSERT INTO accounts (user_id, name, amount) VALUES (1, \"A\", 10), (1, \"B\", 20)\n", + "\"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "4cd70a0c-ca3b-44bb-aacc-8dc95ad02d29", + "metadata": {}, + "outputs": [], + "source": [ + "conn.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f77e36c7-0ee3-4dd8-80f5-10321c1c78cd", + "metadata": {}, + "outputs": [], + "source": [ + "# conn.execute(text(\"\"\"\n", + "# DELETE FROM users WHERE id = 1\n", + "# \"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "a0862124-c44a-42bb-a63d-d160dd1312f4", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "url = \"https://raw.githubusercontent.com/cfpb/api/master/resources/datasets/hmda/code_sheets/\"\n", + "df = pd.read_csv(url + \"action_taken.csv\")\n", + "df.to_sql(\"actions\", conn, index=False, if_exists=\"replace\")\n", + "df = pd.read_csv(url + \"loan_type.csv\")\n", + "df.to_sql(\"loan_types\", conn, index=False, if_exists=\"replace\")\n", + "df = pd.read_csv(url + \"loan_purpose.csv\")\n", + "df.to_sql(\"purposes\", conn, index=False, if_exists=\"replace\")\n", + "conn.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "3e6ac1cc-5f2b-4c43-a3e7-862713ff66f0", + "metadata": {}, + "outputs": [], + "source": [ + "import pyarrow as pa\n", + "import pyarrow.parquet\n", + "t = pa.parquet.read_table(\n", + " \"loans.parquet\", \n", + " columns=[\"lei\", \"action_taken\", \"loan_type\", \"loan_amount\",\n", + " \"interest_rate\", \"loan_purpose\", \"income\"]\n", + ")" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 24, + "id": "504b747e-7b4c-4528-845a-32a03a9e2a8e", + "metadata": {}, + "outputs": [], + "source": [ + "t.to_pandas().to_sql(\"loans\", conn, index=False, if_exists=\"replace\", chunksize=10_000)\n", + "conn.commit()" + ] + }, + { + "cell_type": "markdown", + "id": "1fc118ae-84c9-423b-8038-c0d44bc9a443", + "metadata": {}, + "source": [ + "# Transactions" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "4cb87a68-c82a-404e-bf43-1360753aba41", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "remaining: -1\n", + "rollback!\n" + ] + } + ], + "source": [ + "conn.execute(text(\"\"\"\n", + "update accounts set amount = amount - 5 WHERE name = 'A'\n", + "\"\"\"))\n", + "\n", + "conn.execute(text(\"\"\"\n", + "update accounts set amount = amount + 5 WHERE name = 'B'\n", + "\"\"\"))\n", + "\n", + "remaining_amount = list(conn.execute(text(\"\"\"\n", + "select amount from accounts WHERE name = 'A'\n", + "\"\"\")))[0][0]\n", + "print(\"remaining:\", remaining_amount)\n", + "\n", + "if remaining_amount >= 0:\n", + " print(\"commit!\")\n", + " conn.commit()\n", + "else:\n", + " print(\"rollback!\")\n", + " conn.rollback()" + ] + }, + { + "cell_type": "markdown", + "id": "019705c1-82e6-4434-9359-0843d6b2d8c5", + "metadata": {}, + "source": [ + "# Analysis/SQL Queries" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "35290388-dbfe-451f-94ba-d9ef8dee4e00", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>id</th>\n", + " 
<th>action_taken</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>Loan originated</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>2</td>\n", + " <td>Application approved but not accepted</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>3</td>\n", + " <td>Application denied by financial institution</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>4</td>\n", + " <td>Application withdrawn by applicant</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>5</td>\n", + " <td>File closed for incompleteness</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>6</td>\n", + " <td>Loan purchased by the institution</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>7</td>\n", + " <td>Preapproval request denied by financial instit...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>8</td>\n", + " <td>Preapproval request approved but not accepted</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " id action_taken\n", + "0 1 Loan originated\n", + "1 2 Application approved but not accepted\n", + "2 3 Application denied by financial institution\n", + "3 4 Application withdrawn by applicant\n", + "4 5 File closed for incompleteness\n", + "5 6 Loan purchased by the institution\n", + "6 7 Preapproval request denied by financial instit...\n", + "7 8 Preapproval request approved but not accepted" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what are all the possible actions? 
Practice SELECT/FROM.\n", + "pd.read_sql(\"\"\"\n", + "SELECT *\n", + "FROM actions\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "cdbedc10-8d92-4c1a-83aa-9c604e447d1e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>lei</th>\n", + " <th>action_taken</th>\n", + " <th>loan_type</th>\n", + " <th>loan_amount</th>\n", + " <th>interest_rate</th>\n", + " <th>loan_purpose</th>\n", + " <th>income</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>6</td>\n", + " <td>1</td>\n", + " <td>305000.0</td>\n", + " <td>3.875</td>\n", + " <td>1</td>\n", + " <td>108.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>4</td>\n", + " <td>1</td>\n", + " <td>65000.0</td>\n", + " <td>NA</td>\n", + " <td>1</td>\n", + " <td>103.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>6</td>\n", + " <td>1</td>\n", + " <td>75000.0</td>\n", + " <td>3.25</td>\n", + " <td>1</td>\n", + " <td>146.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>155000.0</td>\n", + " <td>4.0</td>\n", + " <td>32</td>\n", + " <td>70.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>305000.0</td>\n", + " <td>3.25</td>\n", + " <td>1</td>\n", + " <td>71.0</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>5</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>175000.0</td>\n", + " <td>3.375</td>\n", + " <td>1</td>\n", + " <td>117.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>575000.0</td>\n", + " <td>4.5</td>\n", + " <td>1</td>\n", + " <td>180.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>105000.0</td>\n", + " <td>5.375</td>\n", + " <td>1</td>\n", + " <td>180.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>54930034MNPILHP25H80</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>85000.0</td>\n", + " <td>3.375</td>\n", + " <td>1</td>\n", + " <td>136.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>549300FQ2SN6TRRGB032</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>405000.0</td>\n", + " <td>Exempt</td>\n", + " <td>1</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " lei action_taken loan_type loan_amount interest_rate \\\n", + "0 54930034MNPILHP25H80 6 1 305000.0 3.875 \n", + "1 54930034MNPILHP25H80 4 1 65000.0 NA \n", + "2 54930034MNPILHP25H80 6 1 75000.0 3.25 \n", + "3 54930034MNPILHP25H80 1 1 155000.0 4.0 \n", + "4 54930034MNPILHP25H80 1 1 305000.0 3.25 \n", + "5 54930034MNPILHP25H80 1 1 175000.0 3.375 \n", + "6 54930034MNPILHP25H80 1 1 575000.0 4.5 \n", + "7 54930034MNPILHP25H80 1 1 105000.0 5.375 \n", + "8 54930034MNPILHP25H80 1 1 85000.0 3.375 \n", + "9 549300FQ2SN6TRRGB032 1 1 405000.0 Exempt \n", + "\n", + " loan_purpose income \n", + "0 1 108.0 \n", + "1 1 103.0 \n", + "2 1 146.0 \n", + "3 32 70.0 \n", + "4 1 71.0 \n", + "5 1 117.0 \n", + "6 1 180.0 \n", + "7 1 180.0 \n", + "8 1 136.0 \n", + "9 1 NaN " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"# what are the first 10 loans? Practice LIMIT.\n", + "pd.read_sql(\"\"\"\n", + "SELECT *\n", + "FROM loans\n", + "LIMIT 10\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4cc63602-f5d1-4502-b955-54d504dfcb40", + "metadata": {}, + "outputs": [], + "source": [ + "# projection: choosing what columns (SELECT)\n", + "# selection: filtering rows (WHERE)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "c0eb9c38-ff94-486a-b34e-e241d8f69d01", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>interest_rate</th>\n", + " <th>amount_in_thousands</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>3.875</td>\n", + " <td>305.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>NA</td>\n", + " <td>65.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>3.25</td>\n", + " <td>75.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>4.0</td>\n", + " <td>155.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>3.25</td>\n", + " <td>305.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>3.375</td>\n", + " <td>175.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>4.5</td>\n", + " <td>575.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>5.375</td>\n", + " <td>105.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>3.375</td>\n", + " <td>85.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>Exempt</td>\n", + " 
<td>405.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " interest_rate amount_in_thousands\n", + "0 3.875 305.0\n", + "1 NA 65.0\n", + "2 3.25 75.0\n", + "3 4.0 155.0\n", + "4 3.25 305.0\n", + "5 3.375 175.0\n", + "6 4.5 575.0\n", + "7 5.375 105.0\n", + "8 3.375 85.0\n", + "9 Exempt 405.0" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what are the first 10 interest rates and loan amounts (in thousands)? Practice SELECT.\n", + "pd.read_sql(\"\"\"\n", + "SELECT interest_rate, loan_amount / 1000 AS amount_in_thousands\n", + "FROM loans\n", + "LIMIT 10\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "29f2e695-fc92-412c-bc1c-5c06f4200d25", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>lei</th>\n", + " <th>action_taken</th>\n", + " <th>loan_type</th>\n", + " <th>loan_amount</th>\n", + " <th>interest_rate</th>\n", + " <th>loan_purpose</th>\n", + " <th>income</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>254900IER2H3R8YLBW04</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>105000.0</td>\n", + " <td>2.875</td>\n", + " <td>31</td>\n", + " <td>1530000.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>3Y4U8VZURTYWI1W2K376</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>7455000.0</td>\n", + " <td>NA</td>\n", + " <td>4</td>\n", + " <td>94657029.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " 
<td>549300CS1XP28EERR469</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>75000.0</td>\n", + " <td>4.99</td>\n", + " <td>4</td>\n", + " <td>2030000.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>549300CS1XP28EERR469</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>205000.0</td>\n", + " <td>3.75</td>\n", + " <td>1</td>\n", + " <td>7291000.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " lei action_taken loan_type loan_amount interest_rate \\\n", + "0 254900IER2H3R8YLBW04 1 1 105000.0 2.875 \n", + "1 3Y4U8VZURTYWI1W2K376 3 1 7455000.0 NA \n", + "2 549300CS1XP28EERR469 1 1 75000.0 4.99 \n", + "3 549300CS1XP28EERR469 1 1 205000.0 3.75 \n", + "\n", + " loan_purpose income \n", + "0 31 1530000.0 \n", + "1 4 94657029.0 \n", + "2 4 2030000.0 \n", + "3 1 7291000.0 " + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what are the loans for individuals with income over $1 million? 
Practice WHERE.\n", + "pd.read_sql(\"\"\"\n", + "SELECT *\n", + "FROM loans\n", + "WHERE income > 1000000\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "b0808413-c09d-4d5f-9c03-b7f5747d3205", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>lei</th>\n", + " <th>action_taken</th>\n", + " <th>loan_type</th>\n", + " <th>loan_amount</th>\n", + " <th>interest_rate</th>\n", + " <th>loan_purpose</th>\n", + " <th>income</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>549300XWUSRVVOHPRY47</td>\n", + " <td>6</td>\n", + " <td>1</td>\n", + " <td>264185000.0</td>\n", + " <td>NA</td>\n", + " <td>1</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>AD6GFRVSDT01YPT1CS68</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>74755000.0</td>\n", + " <td>1.454</td>\n", + " <td>1</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>AD6GFRVSDT01YPT1CS68</td>\n", + " <td>4</td>\n", + " <td>2</td>\n", + " <td>66005000.0</td>\n", + " <td>NA</td>\n", + " <td>1</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>YQI2CPR3Z44KAR0HG822</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>65005000.0</td>\n", + " <td>3.0</td>\n", + " <td>1</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>254900YA1AQXNM8QVZ06</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>63735000.0</td>\n", + " <td>2.99</td>\n", + " <td>2</td>\n", + " 
<td>None</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " lei action_taken loan_type loan_amount interest_rate \\\n", + "0 549300XWUSRVVOHPRY47 6 1 264185000.0 NA \n", + "1 AD6GFRVSDT01YPT1CS68 1 1 74755000.0 1.454 \n", + "2 AD6GFRVSDT01YPT1CS68 4 2 66005000.0 NA \n", + "3 YQI2CPR3Z44KAR0HG822 1 1 65005000.0 3.0 \n", + "4 254900YA1AQXNM8QVZ06 1 2 63735000.0 2.99 \n", + "\n", + " loan_purpose income \n", + "0 1 None \n", + "1 1 None \n", + "2 1 None \n", + "3 1 None \n", + "4 2 None " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what are the five biggest loans in terms of dollar amount? Practice ORDER BY.\n", + "pd.read_sql(\"\"\"\n", + "SELECT *\n", + "FROM loans\n", + "ORDER BY loan_amount DESC\n", + "LIMIT 5\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "77bc2433-5cdb-4134-a75a-6ddbf5e34661", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>action_taken</th>\n", + " <th>loan_type</th>\n", + " <th>lei</th>\n", + " <th>action_taken</th>\n", + " <th>loan_type</th>\n", + " <th>loan_amount</th>\n", + " <th>interest_rate</th>\n", + " <th>loan_purpose</th>\n", + " <th>income</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Loan purchased by the institution</td>\n", + " <td>Conventional</td>\n", + " <td>549300XWUSRVVOHPRY47</td>\n", + " <td>6</td>\n", + " <td>1</td>\n", + " <td>264185000.0</td>\n", + " <td>NA</td>\n", + " 
<td>1</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Loan originated</td>\n", + " <td>Conventional</td>\n", + " <td>AD6GFRVSDT01YPT1CS68</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>74755000.0</td>\n", + " <td>1.454</td>\n", + " <td>1</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Application withdrawn by applicant</td>\n", + " <td>FHA-insured</td>\n", + " <td>AD6GFRVSDT01YPT1CS68</td>\n", + " <td>4</td>\n", + " <td>2</td>\n", + " <td>66005000.0</td>\n", + " <td>NA</td>\n", + " <td>1</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Loan originated</td>\n", + " <td>Conventional</td>\n", + " <td>YQI2CPR3Z44KAR0HG822</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>65005000.0</td>\n", + " <td>3.0</td>\n", + " <td>1</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Loan originated</td>\n", + " <td>FHA-insured</td>\n", + " <td>254900YA1AQXNM8QVZ06</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>63735000.0</td>\n", + " <td>2.99</td>\n", + " <td>2</td>\n", + " <td>None</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " action_taken loan_type lei \\\n", + "0 Loan purchased by the institution Conventional 549300XWUSRVVOHPRY47 \n", + "1 Loan originated Conventional AD6GFRVSDT01YPT1CS68 \n", + "2 Application withdrawn by applicant FHA-insured AD6GFRVSDT01YPT1CS68 \n", + "3 Loan originated Conventional YQI2CPR3Z44KAR0HG822 \n", + "4 Loan originated FHA-insured 254900YA1AQXNM8QVZ06 \n", + "\n", + " action_taken loan_type loan_amount interest_rate loan_purpose income \n", + "0 6 1 264185000.0 NA 1 None \n", + "1 1 1 74755000.0 1.454 1 None \n", + "2 4 2 66005000.0 NA 1 None \n", + "3 1 1 65005000.0 3.0 1 None \n", + "4 1 2 63735000.0 2.99 2 None " + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what are 
the actions taken and types for those loans (show the text, not numbers)? Practice INNER JOIN.\n", + "pd.read_sql(\"\"\"\n", + "SELECT actions.action_taken, loan_types.loan_type, loans.*\n", + "FROM loans\n", + "INNER JOIN actions ON loans.action_taken = actions.id\n", + "INNER JOIN loan_types ON loans.loan_type = loan_types.id\n", + "ORDER BY loan_amount DESC\n", + "LIMIT 5\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "fe7e1b59-644c-4b12-8c1e-38dce7fdd53b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>lei</th>\n", + " <th>action_taken</th>\n", + " <th>loan_type</th>\n", + " <th>loan_amount</th>\n", + " <th>interest_rate</th>\n", + " <th>loan_purpose</th>\n", + " <th>income</th>\n", + " <th>id</th>\n", + " <th>loan_purpose</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>3</td>\n", + " <td>Refinancing</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " lei action_taken loan_type loan_amount interest_rate loan_purpose income \\\n", + "0 None None None None None None None \n", + "\n", + " id loan_purpose \n", + "0 3 Refinancing " + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what is a loan_purpose that doesn't appear in the loans table? 
Practice LEFT/RIGHT JOIN.\n", + "pd.read_sql(\"\"\"\n", + "SELECT *\n", + "FROM loans\n", + "RIGHT JOIN purposes ON loans.loan_purpose = purposes.id\n", + "WHERE loans.loan_purpose IS NULL\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "b0ce8f29-49ef-4e1c-9f78-e42e4242f7b6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>COUNT(*)</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>447367</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " COUNT(*)\n", + "0 447367" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many rows are in the table? 
Practice COUNT(*).\n", + "pd.read_sql(\"\"\"\n", + "SELECT COUNT(*)\n", + "FROM loans\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "17652759-6909-47e7-86d9-efd91c374cf3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>COUNT(income)</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>399948</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " COUNT(income)\n", + "0 399948" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many non-null values are in the income column? 
Practice COUNT(column).\n", + "pd.read_sql(\"\"\"\n", + "SELECT COUNT(income)\n", + "FROM loans\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "6c029b7b-5b29-4be9-be5b-83ee246e5f30", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>AVG(interest_rate)</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>2.21657</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " AVG(interest_rate)\n", + "0 2.21657" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what is the average interest rate for loans of type \"Conventional\"? 
Practice AVG.\n", + "pd.read_sql(\"\"\"\n", + "SELECT AVG(interest_rate)\n", + "FROM loans\n", + "INNER JOIN loan_types ON loans.loan_type = loan_types.id\n", + "WHERE loan_types.loan_type = \"Conventional\"\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "40d65b3d-ed6f-4318-8e95-b03dcc668d01", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>loan_type</th>\n", + " <th>COUNT(*)</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Conventional</td>\n", + " <td>389217</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>VA-guaranteed</td>\n", + " <td>24551</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>FHA-insured</td>\n", + " <td>30496</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>FSA/RHS-guaranteed</td>\n", + " <td>3103</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " loan_type COUNT(*)\n", + "0 Conventional 389217\n", + "1 VA-guaranteed 24551\n", + "2 FHA-insured 30496\n", + "3 FSA/RHS-guaranteed 3103" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many loans are there of each type? 
Practice GROUP BY.\n", + "pd.read_sql(\"\"\"\n", + "SELECT loan_types.loan_type, COUNT(*)\n", + "FROM loans\n", + "INNER JOIN loan_types ON loans.loan_type = loan_types.id\n", + "GROUP BY loan_types.loan_type\n", + "\"\"\", conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "c1bf9134-2694-44e4-9a38-bcb5b51fa2e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>loan_type</th>\n", + " <th>count</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Conventional</td>\n", + " <td>389217</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>VA-guaranteed</td>\n", + " <td>24551</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>FHA-insured</td>\n", + " <td>30496</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " loan_type count\n", + "0 Conventional 389217\n", + "1 VA-guaranteed 24551\n", + "2 FHA-insured 30496" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# which loan types appear at least 10,000 times? Practice HAVING.\n", + "# how many loans are there of each type? 
Practice GROUP BY.\n", + "pd.read_sql(\"\"\"\n", + "SELECT loan_types.loan_type, COUNT(*) AS count\n", + "FROM loans\n", + "INNER JOIN loan_types ON loans.loan_type = loan_types.id\n", + "GROUP BY loan_types.loan_type\n", + "HAVING count >= 10000\n", + "\"\"\", conn)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- GitLab