From c99c65b5efdaa91f8fce8fadffc41c4747e0a3a0 Mon Sep 17 00:00:00 2001 From: gsingh58 <gurmail-singh@wisc.edu> Date: Thu, 7 Mar 2024 08:36:02 -0600 Subject: [PATCH] lec13 updated --- lecture_material/13-web-3/solution.ipynb | 177 +++++++++++++++++- .../13-web-3/template_lec_001.ipynb | 93 ++++++++- .../13-web-3/template_lec_002.ipynb | 91 ++++++++- 3 files changed, 353 insertions(+), 8 deletions(-) diff --git a/lecture_material/13-web-3/solution.ipynb b/lecture_material/13-web-3/solution.ipynb index bbab494..0b53ae2 100644 --- a/lecture_material/13-web-3/solution.ipynb +++ b/lecture_material/13-web-3/solution.ipynb @@ -5,7 +5,7 @@ "id": "cf313adf", "metadata": {}, "source": [ - "# Web 3: More Flask" + "# Web 3: More Flask and A/B testing" ] }, { @@ -17,7 +17,11 @@ "source": [ "import requests\n", "import time\n", - "import urllib.robotparser" + "import urllib.robotparser\n", + "\n", + "import pandas as pd\n", + "# new import statement: requires pip3 install scipy\n", + "from scipy import stats" ] }, { @@ -89,7 +93,7 @@ { "data": { "text/plain": [ - "False" + "True" ] }, "execution_count": 4, @@ -134,6 +138,173 @@ " \n", "friendly_get(base_url + \"slow\").text" ] + }, + { + "cell_type": "markdown", + "id": "1dbd2fad-1bf5-437f-9ebb-6694ba860e27", + "metadata": {}, + "source": [ + "# A/B testing" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6922661b-af1b-41d5-bcb0-f2759e890f3e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>click</th>\n", + " <th>no-click</th>\n", + " </tr>\n", + " </thead>\n", + " 
<tbody>\n", + " <tr>\n", + " <th>A</th>\n", + " <td>50</td>\n", + " <td>50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>B</th>\n", + " <td>55</td>\n", + " <td>45</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " click no-click\n", + "A 50 50\n", + "B 55 45" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({\n", + " \"click\": {\"A\": 50, \"B\": 55},\n", + " \"no-click\": {\"A\": 50, \"B\": 45}\n", + "})\n", + "df\n", + "# Which has the higher CTR, A or B?" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "31b0de1b-f7fb-42ee-a087-3d884cc31590", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.5712421394829712" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "_, pvalue = stats.fisher_exact(df)\n", + "pvalue \n", + "# no evidence that A and B are different because pvalue is not less than 5%" + ] + }, + { + "cell_type": "markdown", + "id": "5dde0ee6-b03e-4bbf-b896-1794a2bff610", + "metadata": {}, + "source": [ + "### Two situations when pvalue will be lower than significance threshold\n", + "\n", + "1. Sample size is the same, but skew is very heavy --- unlikely to have that by chance\n", + "2. 
Sample size is large, but skew is small " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b8719b9e-f366-47a0-9d96-50b4fdd8fb27", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.00042033045869994034" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Scenario 1: \n", + "# Sample size is the same, but skew is very heavy --- \n", + "# unlikely to have that by chance\n", + "\n", + "df = pd.DataFrame({\n", + " \"click\": {\"A\": 50, \"B\": 75},\n", + " \"no-click\": {\"A\": 50, \"B\": 25}\n", + "})\n", + "_, pvalue = stats.fisher_exact(df)\n", + "pvalue" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "72a18c43-0f07-4e7f-bb88-89a43563f295", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.02820356890423392" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Scenario 2: \n", + "# Sample size is large, but skew is small \n", + "\n", + "df = pd.DataFrame({\n", + " \"click\": {\"A\": 500, \"B\": 550},\n", + " \"no-click\": {\"A\": 500, \"B\": 450}\n", + "})\n", + "_, pvalue = stats.fisher_exact(df)\n", + "pvalue" + ] } ], "metadata": { diff --git a/lecture_material/13-web-3/template_lec_001.ipynb b/lecture_material/13-web-3/template_lec_001.ipynb index 2b3de51..58a8c2f 100644 --- a/lecture_material/13-web-3/template_lec_001.ipynb +++ b/lecture_material/13-web-3/template_lec_001.ipynb @@ -5,7 +5,7 @@ "id": "cf313adf", "metadata": {}, "source": [ - "# Web 3: More Flask" + "# Web 3: More Flask and A/B testing" ] }, { @@ -16,7 +16,11 @@ "outputs": [], "source": [ "import requests\n", - "import time\n" + "import time\n", + "\n", + "import pandas as pd\n", + "# new import statement: requires pip3 install scipy\n", + "from scipy import stats" ] }, { @@ -74,7 +78,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6cc81b85", + "id": 
"0eb41e31-490e-419b-af15-db66e0dd6fd1", "metadata": {}, "outputs": [], "source": [ @@ -91,6 +95,89 @@ " \n", "friendly_get(base_url + \"slow\").text" ] + }, + { + "cell_type": "markdown", + "id": "fcfa857a-10e7-4e74-adcc-9c4d384e9986", + "metadata": {}, + "source": [ + "# A/B testing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4839880-5642-49ce-b1cf-938ddf409229", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame({\n", + " \"click\": {\"A\": 50, \"B\": 55},\n", + " \"no-click\": {\"A\": 50, \"B\": 45}\n", + "})\n", + "df\n", + "# Which has the higher CTR A or B?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "341e9430-d7c2-4cba-a2f1-b5fb95a1efa6", + "metadata": {}, + "outputs": [], + "source": [ + "_, pvalue = stats.fisher_exact(df)\n", + "pvalue \n", + "# no evidence that A and B are difference because pvalue is not less than 5%" + ] + }, + { + "cell_type": "markdown", + "id": "3fa3b996-ef70-4492-8221-dbe597af8541", + "metadata": {}, + "source": [ + "### Two situations when pvalue will be lower than significance threshold\n", + "\n", + "1. Sample size is the same, but skew is very heavy --- unlikely to have that by chance\n", + "2. 
Sample size is large, but skew is small " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "245ac3cf-a7b4-4bd4-81d8-caf7fc4897fe", + "metadata": {}, + "outputs": [], + "source": [ + "# Scenario 1: \n", + "# Sample size is the same, but skew is very heavy --- \n", + "# unlikely to have that by chance\n", + "\n", + "df = pd.DataFrame({\n", + " \"click\": {\"A\": 50, \"B\": 75},\n", + " \"no-click\": {\"A\": 50, \"B\": 25}\n", + "})\n", + "_, pvalue = stats.fisher_exact(df)\n", + "pvalue" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3a1b056-3315-494d-9667-3bfa5ebfa863", + "metadata": {}, + "outputs": [], + "source": [ + "# Scenario 2: \n", + "# Sample size is large, but skew is small \n", + "\n", + "df = pd.DataFrame({\n", + " \"click\": {\"A\": 500, \"B\": 550},\n", + " \"no-click\": {\"A\": 500, \"B\": 450}\n", + "})\n", + "_, pvalue = stats.fisher_exact(df)\n", + "pvalue" + ] } ], "metadata": { diff --git a/lecture_material/13-web-3/template_lec_002.ipynb b/lecture_material/13-web-3/template_lec_002.ipynb index e47bb0e..24641ed 100644 --- a/lecture_material/13-web-3/template_lec_002.ipynb +++ b/lecture_material/13-web-3/template_lec_002.ipynb @@ -5,7 +5,7 @@ "id": "cf313adf", "metadata": {}, "source": [ - "# Web 3: More Flask" + "# Web 3: More Flask and A/B testing" ] }, { @@ -16,7 +16,11 @@ "outputs": [], "source": [ "import requests\n", - "import time\n" + "import time\n", + "\n", + "import pandas as pd\n", + "# new import statement: requires pip3 install scipy\n", + "from scipy import stats" ] }, { @@ -105,6 +109,89 @@ " \n", "friendly_get(base_url + \"slow\").text" ] + }, + { + "cell_type": "markdown", + "id": "b2353748-f9a6-4de1-80f0-d5a398df5783", + "metadata": {}, + "source": [ + "# A/B testing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcc93502-38f4-4e4a-b7b4-e4c868492aa3", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame({\n", + " \"click\": {\"A\": 50, 
\"B\": 55},\n", + " \"no-click\": {\"A\": 50, \"B\": 45}\n", + "})\n", + "df\n", + "# Which has the higher CTR A or B?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30438fd0-88fd-4f81-b2b7-dcb96679a489", + "metadata": {}, + "outputs": [], + "source": [ + "_, pvalue = stats.fisher_exact(df)\n", + "pvalue \n", + "# no evidence that A and B are difference because pvalue is not less than 5%" + ] + }, + { + "cell_type": "markdown", + "id": "204bb86d-4504-4a15-88d0-9c6b8189c23c", + "metadata": {}, + "source": [ + "### Two situations when pvalue will be lower than significance threshold\n", + "\n", + "1. Sample size is the same, but skew is very heavy --- unlikely to have that by chance\n", + "2. Sample size is large, but skew is small " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74484f80-8dd4-4035-a329-157e1389084b", + "metadata": {}, + "outputs": [], + "source": [ + "# Scenario 1: \n", + "# Sample size is the same, but skew is very heavy --- \n", + "# unlikely to have that by chance\n", + "\n", + "df = pd.DataFrame({\n", + " \"click\": {\"A\": 50, \"B\": 75},\n", + " \"no-click\": {\"A\": 50, \"B\": 25}\n", + "})\n", + "_, pvalue = stats.fisher_exact(df)\n", + "pvalue" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "938738ca-2db3-4408-b4e4-2571b89084ee", + "metadata": {}, + "outputs": [], + "source": [ + "# Scenario 2: \n", + "# Sample size is large, but skew is small \n", + "\n", + "df = pd.DataFrame({\n", + " \"click\": {\"A\": 500, \"B\": 550},\n", + " \"no-click\": {\"A\": 500, \"B\": 450}\n", + "})\n", + "_, pvalue = stats.fisher_exact(df)\n", + "pvalue" + ] } ], "metadata": { -- GitLab