{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "cf313adf",
   "metadata": {},
   "source": [
    "# Web 3: More Flask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d55e4bb4-9f29-4f4f-bba6-05054718259b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import time\n",
    "import urllib.robotparser"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "527600aa",
   "metadata": {},
   "source": [
    "### Rate-limited webpage parsing\n",
    "\n",
    "- `requests` module:\n",
    "    - `resp = requests.get(<URL>)` method: enables us to send HTTP GET request\n",
    "    - `resp.status_code`: status code of the response\n",
    "    - `resp.text`: `str` text content of the response\n",
    "    - `resp.headers`: `dict` content of response headers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8241e51c",
   "metadata": {},
   "outputs": [],
   "source": [
    "base_url = \"http://34.123.132.20:5000/\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "23ba100b",
   "metadata": {},
   "source": [
    "### `urllib.robotparser`\n",
    "\n",
    "- Documentation: https://docs.python.org/3/library/urllib.robotparser.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "379c3ae5-7344-45b1-88c3-b35f0bd8eb5b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rp = urllib.robotparser.RobotFileParser()\n",
    "rp.set_url(base_url + \"/robots.txt\")\n",
    "rp.read()\n",
    "rp.can_fetch(\"cs320bot\", base_url + \"/slow\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2e3fb01c-4281-4cbf-8828-98e04d27d09a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rp.can_fetch(\"cs320bot\", base_url + \"/never\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "6cc81b85",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'welcome!'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def friendly_get(url):\n",
    "    if not rp.can_fetch(\"cs320bot\", url):\n",
    "        raise Exception(\"you're not supposed to visit that page\")\n",
    "    while True:\n",
    "        resp = requests.get(url)\n",
    "        if resp.status_code == 429:\n",
    "            seconds = int(resp.headers.get(\"Retry-After\", 1))\n",
    "            print(f\"sleep {seconds}\")\n",
    "            time.sleep(seconds)\n",
    "            continue\n",
    "        resp.raise_for_status() # raise exception if not 200\n",
    "        return resp\n",
    "    \n",
    "friendly_get(base_url + \"/slow\").text"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}