leec11 modified

2c730788 · gsingh58 · d9e18bcf · 2c730788 · 2c730788
Commit 2c730788 authored 8 months ago by gsingh58
--- a/lecture_material/11-Web_1/11-web.ipynb
+++ b/lecture_material/11-Web_1/11-web.ipynb
--- a/lecture_material/11-Web_1/11-web_001.ipynb
+++ b/lecture_material/11-Web_1/11-web_001.ipynb
@@ -59,6 +59,14 @@
    "b = webdriver.Chrome(options=options, service=service)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "722db1fa-b151-4ede-b895-6162aafc4843",
+   "metadata": {},
+   "source": [
+    "## Tricky pages"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "fcabc9ed-2b56-4b84-b66f-f1af5c556743",
@@ -431,7 +439,7 @@
   "id": "c8543c5b",
   "metadata": {},
   "source": [
-    "# Web 2: Recursive Crawl\n",
+    "## Recursive Crawl\n",
    "\n",
    "- crawling: process of finding all the webpages inside a website"
   ]

 %% Cell type:markdown id:e172ecb9 tags:

 # Web 1: Selenium

 - Operations:
    - `b.get(URL)`: sends HTTP GET request to the URL
    - `b.page_source`: HTML source for the page

 %% Cell type:code id:6f9bec30-8c27-4cc6-8e64-b44c46bd34c6 tags:

 ``` python
 import os
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from selenium.common.exceptions import NoSuchElementException

 from selenium import webdriver

 from webdriver_manager.chrome import ChromeDriverManager
 from IPython.display import display, Image

 import time
 import pandas as pd

 from collections import deque
 from graphviz import Digraph

 # os.system("pkill -f -9 chromium")
 # os.system("pkill -f -9 chrome")
 ```

 %% Cell type:code id:371ee5f8-22fd-4bdc-8878-b8467f53223d tags:

 ``` python
 options = Options()
 options.add_argument("--headless")
 options.add_argument("--no-sandbox")
 options.add_argument("--disable-dev-shm-usage")

 service = Service(ChromeDriverManager().install())

 b = webdriver.Chrome(options=options, service=service)
 ```

+%% Cell type:markdown id:722db1fa-b151-4ede-b895-6162aafc4843 tags:
+
+## Tricky pages
+
 %% Cell type:markdown id:fcabc9ed-2b56-4b84-b66f-f1af5c556743 tags:

 ### page1.html: Javascript table example

 %% Cell type:markdown id:b8e7031b tags:

 ### Selenium operations

 - Operations:
    - `b.get(URL)`: sends HTTP GET request to the URL
    - `b.page_source`: HTML source for the page
    - `b.find_elements("id", <ID>)`: searches for a specific element that matches the "id"
    - `b.find_elements("tag name", <TAG>)`: searches for a specific element using corresponding tag name
    - `b.find_element` versus `b.find_elements`:
        - `find_element` gives first match
        - `find_elements` gives all matches
    - `<element obj>.text`: gives text associated with that element

 %% Cell type:markdown id:15b97efd-7f40-4038-bf12-78c1a745d276 tags:

 ### POLLING: How would we know when the updated page becomes available?
 - keep checking regularly until you get all the details you are looking for.

 %% Cell type:code id:662a2ae1-8077-45e1-bde9-9e179720d26e tags:

 ``` python
 # Request page1, then poll until exactly two <table> elements are found.
 url = "https://cs320.cs.wisc.edu/tricky/page1.html"
 b.get(url)

 # NOTE(review): there is no timeout; the loop only ends once the table count is exactly 2.
 while True:
    # re-query on each pass: the page may still be adding tables (presumably via JavaScript)
    tbls = b.find_elements("tag name", "table")
    print("Tables:", len(tbls))

    if len(tbls) == 2:
        print(tbls)
        break

    time.sleep(0.1) # sleep for 0.1 second
 ```

 %% Cell type:markdown id:c70f9430-9ae5-4d70-9355-4b113b9fc20a tags:

 ### Let's extract the 2nd table information

 %% Cell type:code id:0c626766-4a91-482f-bd01-77a56a7a2c0f tags:

 ``` python
 # last table in the list collected by the polling cell
 tbl = tbls[-1]

 # TODO: find all tr elements
 trs = tbl.find_elements("tag name", "tr")

 # TODO: find all td elements
 # TODO: extract text for all td elements into a list of lists
 rows = []

 for tr in trs:
    tds = tr.find_elements("tag name", "td")
    # every row is expected to contain exactly two td cells
    assert len(tds) == 2
    rows.append([tds[0].text, tds[1].text])

 # bare expression: the notebook displays the value of the last line of a cell
 rows
 ```

 %% Cell type:markdown id:358cfbab-5db5-4f9b-a205-d78a74cf04ca tags:

 ### Converting `rows` into a `DataFrame`

 %% Cell type:code id:d5084534-db73-4766-bded-2cf50a3fad0c tags:

 ``` python
 pd.DataFrame(rows)
 ```

 %% Cell type:markdown id:a9358137-a2d7-4ebb-96a3-4bb8513084a4 tags:

 ### How can we visually see the page on the VM?

 - Operations:
    - `b.save_screenshot("some_file.png")`: saves a screenshot of the rendered page
    - `b.set_window_size(<width>, <height>)`: controls size of the image
    - import statement: `from IPython.display import display, Image`: helps us show the screenshot as an image inside the notebook

 %% Cell type:code id:0eb2a2cf-02ee-4d8a-baaf-48c7b1f118fc tags:

 ``` python
 ```

 %% Cell type:code id:38316989-871e-4b84-b877-b06740a09533 tags:

 ``` python
 ```

 %% Cell type:markdown id:3e124fd0-bf7c-4e0a-9d38-39dac9186e97 tags:

 ### Combining taking screenshot and displaying it
 - useful for p3

 %% Cell type:code id:607f4c72-e1eb-41ea-8124-7618f8e8efd3 tags:

 ``` python
 def show_screen(width=500, height=500):
    """
    Takes a screenshot of the current page in browser `b` and displays it in the notebook.

    width, height: window size in pixels, applied before the screenshot is taken.
    Both default to 500 (the size used by the explicit call in this notebook), so
    `show_screen()` can also be called without arguments.
    The screenshot is written to "out.png", overwriting any earlier one.
    """
    # resize first: the screenshot captures the window at its current size
    b.set_window_size(width, height)
    b.save_screenshot("out.png")
    display(Image("out.png"))
 ```

 %% Cell type:markdown id:3a742134-466f-45be-9289-eafb9029d765 tags:

 ### page2.html: "Show More!" button example

 - Operations:
    - `button_object.click()`: enables us to click the button

 %% Cell type:code id:e6a13d39-ae0f-4bb7-9146-ae27d5255fbd tags:

 ``` python
 url = "https://cs320.cs.wisc.edu/tricky/page2.html"
 b.get(url)
 ```

 %% Cell type:code id:fe8866c9-7d2e-4bab-b533-6abd4d00d3ef tags:

 ``` python
 # TODO: find the id for the more button (inspect element on browser)
 button = b.find_element("id", "???")
 ```

 %% Cell type:code id:6744a392-304c-410e-88ad-4dfd457547a3 tags:

 ``` python
 # TODO: click the button

 # keep running this cell repeatedly
 # once all data is retrieved, we will run into NoSuchElementException
 ```

 %% Cell type:code id:3f234387-417d-4cc1-8571-5a02778db2cd tags:

 ``` python
 # Request `url` again, then keep clicking the "more" button until it can no longer be found.
 b.get(url)

 while True:
    try:
        # find_element raises NoSuchElementException when no element has id "more"
        button = b.find_element("id", "more")
        button.click()
        show_screen(500, 500)
        print("============================================================")
    except NoSuchElementException:
        # no button left to click: stop looping
        print("We have all the data!")
        break
    # pause 1 second before the next attempt
    time.sleep(1)
 ```

 %% Cell type:code id:603af6f2-a8a1-41ec-8c6e-a24286482e4e tags:

 ``` python
 print(b.page_source)
 ```

 %% Cell type:markdown id:f57b79ca-5e8d-48d1-82d4-1456598ca5a0 tags:

 ### page 3: password protection example

 - Operations:
    - `text_object.send_keys()`: enables us to send data to textbox

 %% Cell type:code id:bf91d59d-6468-4e78-866b-a1f9351e7413 tags:

 ``` python
 url = "https://cs320.cs.wisc.edu/tricky/page3.html"
 b.get(url)
 ```

 %% Cell type:code id:fb734feb-df0d-4d19-89b2-1cc7034f00aa tags:

 ``` python
 # TODO: find the id for password box (inspect element on browser)
 # TODO: find the id for the login button (inspect element on browser)
 text = b.find_element("id", "")
 button = b.find_element("id", "")

 # TODO: send the password (plain text just for example purposes)

 show_screen()

 # TODO: click the button

 show_screen()
 ```

 %% Cell type:code id:d85f4319-a345-4d30-b7aa-2d344dc658c5 tags:

 ``` python
 print(b.page_source)
 ```

 %% Cell type:markdown id:ba660e42-1d67-4c1e-b69f-6bcf25be4b28 tags:

 ### page 4: search data for a year

 - Operations:
    - `text_object.clear()`: enables us to clear the previous text

 %% Cell type:code id:a8ea2227-6d6f-405b-8610-01f627d82b5d tags:

 ``` python
 url = "https://cs320.cs.wisc.edu/tricky/page4.html"
 b.get(url)
 ```

 %% Cell type:code id:276c988a-bacc-4535-9a5f-2e68fc2910b0 tags:

 ``` python
 # TODO: find the id for year box (inspect element on browser)
 # TODO: find the id for the search button (inspect element on browser)
 text = b.find_element("id", "")
 button = b.find_element("id", "")
 ```

 %% Cell type:code id:ff31ce65-12a9-4629-aee5-56e0bbbf9698 tags:

 ``` python
 text.send_keys("1952")
 button.click()
 show_screen()
 # TODO: run this cell twice
 ```

 %% Cell type:markdown id:a892d964-7e87-431e-8f45-81664f159e10 tags:

 #### How many hurricanes were there each year?

 %% Cell type:code id:ab822af8-6d66-46b6-bf19-28d332e4b01e tags:

 ``` python

 for year in range(1950, 1960):
    text.clear()
    text.send_keys(???)
    button.click()
    show_screen()

    # TODO: find all tr elements and count hurricanes for each year

    # TODO: We have to subtract 1 for removing header tr element


 # ax = hurricane_counts.plot.line()
 # ax.set_xlabel("Year")
 # ax.set_ylabel("Hurricane count")
 ```

 %% Cell type:markdown id:c8543c5b tags:

-# Web 2: Recursive Crawl
+## Recursive Crawl

 - crawling: process of finding all the webpages inside a website

 %% Cell type:code id:9e60c940-2624-4d78-a730-03056af72297 tags:

 ``` python
 # TODO: initialize url, send GET request, and display page source
 url = "https://cs320.cs.wisc.edu/crawl/practice1/1.html"

 print(b.page_source)
 ```

 %% Cell type:code id:5ada2baa tags:

 ``` python
 # TODO: show the screen
 ```

 %% Cell type:markdown id:6f3c1cfc tags:

 ### Find all hyperlinks

 - Selenium operations:
    - `b.get(URL)`: sends HTTP GET request to the URL
    - `b.page_source`: HTML source for the page
    - `b.find_elements("id", <ID>)`: searches for a specific element that matches the "id"
    - `b.find_elements("tag name", <TAG>)`: searches for a specific element using corresponding tag name
    - `b.find_element` versus `b.find_elements`:
        - `find_element` gives first match
        - `find_elements` gives all matches
    - `<element obj>.text`: gives text associated with that element
    - `<element obj>.get_attribute(<attribute>)`: gives attribute value; for ex: `<anchor_obj>.get_attribute("href")`

    - `b.save_screenshot("some_file.png")`: saves a screenshot of the rendered page
    - `b.set_window_size(<width>, <height>)`: controls size of the image
    - import statement: `from IPython.display import display, Image`: helps us show the screenshot as an image inside the notebook
    - `button_object.click()`: enables us to click the button
    - `text_object.send_keys()`: enables us to send data to textbox

 %% Cell type:code id:ddab23c0 tags:

 ``` python
 # TODO: find all a elements, and then
 # TODO: loop over all the a elements to print text and use get_attribute to print href value of each a element
 a_elements = b.find_elements("tag name", "a")
 for a_element in a_elements:
    print(a_element.text, a_element.get_attribute("href"))
 ```

 %% Cell type:code id:8a9a54b4 tags:

 ``` python
 # TODO: Generalize to a function
 def get_children(url):
    """
    Finds all hyperlinks in the given url by sending GET request and parsing page source.
    Returns a list of children URLs.

    NOTE: this describes the intended behavior; the body below is still a placeholder.
    """
    # placeholder: until this is implemented the function returns None, not a list
    pass

 url = "https://cs320.cs.wisc.edu/crawl/practice1/1.html"
 get_children(url)
 ```

 %% Cell type:markdown id:3741f5aa tags:

 ### Breadth First Search

 - for crawling, there is no specific "destination", as we need to find all the webpages.

 %% Cell type:code id:b8c61a4f tags:

 ``` python
 start_url = "https://cs320.cs.wisc.edu/crawl/practice1/1.html"
 #start_url = "https://cs320.cs.wisc.edu/crawl/practice7/1.html"

 # Why use a set to keep track of visited nodes?

 # TODO: create a Digraph


    # TODO: add current node to digraph

    # TODO: how do we get all the children?


        # TODO: add an edge

 ```