From c99c65b5efdaa91f8fce8fadffc41c4747e0a3a0 Mon Sep 17 00:00:00 2001
From: gsingh58 <gurmail-singh@wisc.edu>
Date: Thu, 7 Mar 2024 08:36:02 -0600
Subject: [PATCH] lec13 updated

---
 lecture_material/13-web-3/solution.ipynb      | 177 +++++++++++++++++-
 .../13-web-3/template_lec_001.ipynb           |  93 ++++++++-
 .../13-web-3/template_lec_002.ipynb           |  91 ++++++++-
 3 files changed, 353 insertions(+), 8 deletions(-)

diff --git a/lecture_material/13-web-3/solution.ipynb b/lecture_material/13-web-3/solution.ipynb
index bbab494..0b53ae2 100644
--- a/lecture_material/13-web-3/solution.ipynb
+++ b/lecture_material/13-web-3/solution.ipynb
@@ -5,7 +5,7 @@
    "id": "cf313adf",
    "metadata": {},
    "source": [
-    "# Web 3: More Flask"
+    "# Web 3: More Flask and A/B testing"
    ]
   },
   {
@@ -17,7 +17,11 @@
    "source": [
     "import requests\n",
     "import time\n",
-    "import urllib.robotparser"
+    "import urllib.robotparser\n",
+    "\n",
+    "import pandas as pd\n",
+    "# new import statement: requires pip3 install scipy\n",
+    "from scipy import stats"
    ]
   },
   {
@@ -89,7 +93,7 @@
     {
      "data": {
       "text/plain": [
-       "False"
+       "True"
       ]
      },
      "execution_count": 4,
@@ -134,6 +138,173 @@
     "    \n",
     "friendly_get(base_url + \"slow\").text"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1dbd2fad-1bf5-437f-9ebb-6694ba860e27",
+   "metadata": {},
+   "source": [
+    "# A/B testing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "6922661b-af1b-41d5-bcb0-f2759e890f3e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>click</th>\n",
+       "      <th>no-click</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>A</th>\n",
+       "      <td>50</td>\n",
+       "      <td>50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>B</th>\n",
+       "      <td>55</td>\n",
+       "      <td>45</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   click  no-click\n",
+       "A     50        50\n",
+       "B     55        45"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.DataFrame({\n",
+    "    \"click\":    {\"A\": 50, \"B\": 55},\n",
+    "    \"no-click\": {\"A\": 50, \"B\": 45}\n",
+    "})\n",
+    "df\n",
+    "# Which has the higher click-through rate (CTR), A or B?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "31b0de1b-f7fb-42ee-a087-3d884cc31590",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.5712421394829712"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "_, pvalue = stats.fisher_exact(df)\n",
+    "pvalue \n",
+    "# no evidence that A and B are different because pvalue is not less than 5%"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5dde0ee6-b03e-4bbf-b896-1794a2bff610",
+   "metadata": {},
+   "source": [
+    "### Two situations when pvalue will be lower than significance threshold\n",
+    "\n",
+    "1. Sample size is the same, but skew is very heavy --- unlikely to have that by chance\n",
+    "2. Sample size is large, but skew is small "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "b8719b9e-f366-47a0-9d96-50b4fdd8fb27",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.00042033045869994034"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Scenario 1: \n",
+    "# Sample size is the same, but skew is very heavy --- \n",
+    "# unlikely to have that by chance\n",
+    "\n",
+    "df = pd.DataFrame({\n",
+    "    \"click\":    {\"A\": 50, \"B\": 75},\n",
+    "    \"no-click\": {\"A\": 50, \"B\": 25}\n",
+    "})\n",
+    "_, pvalue = stats.fisher_exact(df)\n",
+    "pvalue"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "72a18c43-0f07-4e7f-bb88-89a43563f295",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.02820356890423392"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Scenario 2: \n",
+    "# Sample size is large, but skew is small \n",
+    "\n",
+    "df = pd.DataFrame({\n",
+    "    \"click\":    {\"A\": 500, \"B\": 550},\n",
+    "    \"no-click\": {\"A\": 500, \"B\": 450}\n",
+    "})\n",
+    "_, pvalue = stats.fisher_exact(df)\n",
+    "pvalue"
+   ]
   }
  ],
  "metadata": {
diff --git a/lecture_material/13-web-3/template_lec_001.ipynb b/lecture_material/13-web-3/template_lec_001.ipynb
index 2b3de51..58a8c2f 100644
--- a/lecture_material/13-web-3/template_lec_001.ipynb
+++ b/lecture_material/13-web-3/template_lec_001.ipynb
@@ -5,7 +5,7 @@
    "id": "cf313adf",
    "metadata": {},
    "source": [
-    "# Web 3: More Flask"
+    "# Web 3: More Flask and A/B testing"
    ]
   },
   {
@@ -16,7 +16,11 @@
    "outputs": [],
    "source": [
     "import requests\n",
-    "import time\n"
+    "import time\n",
+    "\n",
+    "import pandas as pd\n",
+    "# new import statement: requires pip3 install scipy\n",
+    "from scipy import stats"
    ]
   },
   {
@@ -74,7 +78,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "6cc81b85",
+   "id": "0eb41e31-490e-419b-af15-db66e0dd6fd1",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -91,6 +95,89 @@
     "    \n",
     "friendly_get(base_url + \"slow\").text"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fcfa857a-10e7-4e74-adcc-9c4d384e9986",
+   "metadata": {},
+   "source": [
+    "# A/B testing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d4839880-5642-49ce-b1cf-938ddf409229",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame({\n",
+    "    \"click\":    {\"A\": 50, \"B\": 55},\n",
+    "    \"no-click\": {\"A\": 50, \"B\": 45}\n",
+    "})\n",
+    "df\n",
+    "# Which has the higher click-through rate (CTR), A or B?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "341e9430-d7c2-4cba-a2f1-b5fb95a1efa6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_, pvalue = stats.fisher_exact(df)\n",
+    "pvalue \n",
+    "# no evidence that A and B are different because pvalue is not less than 5%"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3fa3b996-ef70-4492-8221-dbe597af8541",
+   "metadata": {},
+   "source": [
+    "### Two situations when pvalue will be lower than significance threshold\n",
+    "\n",
+    "1. Sample size is the same, but skew is very heavy --- unlikely to have that by chance\n",
+    "2. Sample size is large, but skew is small "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "245ac3cf-a7b4-4bd4-81d8-caf7fc4897fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Scenario 1: \n",
+    "# Sample size is the same, but skew is very heavy --- \n",
+    "# unlikely to have that by chance\n",
+    "\n",
+    "df = pd.DataFrame({\n",
+    "    \"click\":    {\"A\": 50, \"B\": 75},\n",
+    "    \"no-click\": {\"A\": 50, \"B\": 25}\n",
+    "})\n",
+    "_, pvalue = stats.fisher_exact(df)\n",
+    "pvalue"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e3a1b056-3315-494d-9667-3bfa5ebfa863",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Scenario 2: \n",
+    "# Sample size is large, but skew is small \n",
+    "\n",
+    "df = pd.DataFrame({\n",
+    "    \"click\":    {\"A\": 500, \"B\": 550},\n",
+    "    \"no-click\": {\"A\": 500, \"B\": 450}\n",
+    "})\n",
+    "_, pvalue = stats.fisher_exact(df)\n",
+    "pvalue"
+   ]
   }
  ],
  "metadata": {
diff --git a/lecture_material/13-web-3/template_lec_002.ipynb b/lecture_material/13-web-3/template_lec_002.ipynb
index e47bb0e..24641ed 100644
--- a/lecture_material/13-web-3/template_lec_002.ipynb
+++ b/lecture_material/13-web-3/template_lec_002.ipynb
@@ -5,7 +5,7 @@
    "id": "cf313adf",
    "metadata": {},
    "source": [
-    "# Web 3: More Flask"
+    "# Web 3: More Flask and A/B testing"
    ]
   },
   {
@@ -16,7 +16,11 @@
    "outputs": [],
    "source": [
     "import requests\n",
-    "import time\n"
+    "import time\n",
+    "\n",
+    "import pandas as pd\n",
+    "# new import statement: requires pip3 install scipy\n",
+    "from scipy import stats"
    ]
   },
   {
@@ -105,6 +109,89 @@
     "    \n",
     "friendly_get(base_url + \"slow\").text"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b2353748-f9a6-4de1-80f0-d5a398df5783",
+   "metadata": {},
+   "source": [
+    "# A/B testing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bcc93502-38f4-4e4a-b7b4-e4c868492aa3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame({\n",
+    "    \"click\":    {\"A\": 50, \"B\": 55},\n",
+    "    \"no-click\": {\"A\": 50, \"B\": 45}\n",
+    "})\n",
+    "df\n",
+    "# Which has the higher click-through rate (CTR), A or B?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "30438fd0-88fd-4f81-b2b7-dcb96679a489",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_, pvalue = stats.fisher_exact(df)\n",
+    "pvalue \n",
+    "# no evidence that A and B are different because pvalue is not less than 5%"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "204bb86d-4504-4a15-88d0-9c6b8189c23c",
+   "metadata": {},
+   "source": [
+    "### Two situations when pvalue will be lower than significance threshold\n",
+    "\n",
+    "1. Sample size is the same, but skew is very heavy --- unlikely to have that by chance\n",
+    "2. Sample size is large, but skew is small "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "74484f80-8dd4-4035-a329-157e1389084b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Scenario 1: \n",
+    "# Sample size is the same, but skew is very heavy --- \n",
+    "# unlikely to have that by chance\n",
+    "\n",
+    "df = pd.DataFrame({\n",
+    "    \"click\":    {\"A\": 50, \"B\": 75},\n",
+    "    \"no-click\": {\"A\": 50, \"B\": 25}\n",
+    "})\n",
+    "_, pvalue = stats.fisher_exact(df)\n",
+    "pvalue"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "938738ca-2db3-4408-b4e4-2571b89084ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Scenario 2: \n",
+    "# Sample size is large, but skew is small \n",
+    "\n",
+    "df = pd.DataFrame({\n",
+    "    \"click\":    {\"A\": 500, \"B\": 550},\n",
+    "    \"no-click\": {\"A\": 500, \"B\": 450}\n",
+    "})\n",
+    "_, pvalue = stats.fisher_exact(df)\n",
+    "pvalue"
+   ]
   }
  ],
  "metadata": {
-- 
GitLab