# Web 1: Selenium

- Operations:
 - `b.get(URL)`: sends HTTP GET request to the URL
 - `b.page_source`: HTML source for the page

In [None]:
import os
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

from selenium import webdriver

from webdriver_manager.chrome import ChromeDriverManager
from IPython.display import display, Image

import time
import pandas as pd

from collections import deque
from graphviz import Digraph

# os.system("pkill -f -9 chromium")
# os.system("pkill -f -9 chrome")

In [None]:
# Configure Chrome to run without a visible window, then start the
# browser object `b` that the rest of the notebook drives.
options = Options()
for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(flag)

# webdriver_manager locates (installing if needed) a chromedriver binary
# and hands its path to the Service.
service = Service(ChromeDriverManager().install())
b = webdriver.Chrome(options=options, service=service)

## Tricky pages

### page1.html: Javascript table example

### Selenium operations

- Operations:
 - `b.get(URL)`: sends HTTP GET request to the URL
 - `b.page_source`: HTML source for the page
 - `b.find_elements("id", <ID>)`: searches for elements whose id matches <ID>
 - `b.find_elements("tag name", <TAG>)`: searches for elements with the corresponding tag name
 - `b.find_element` versus `b.find_elements`:
   - `find_element` gives first match
   - `find_elements` gives all matches
 - `<element obj>.text`: gives text associated with that element

### POLLING: How would we know when the updated page becomes available?
- keep checking regularly until you get all the details you are looking for.

In [None]:
url = "https://cs320.cs.wisc.edu/tricky/page1.html"
b.get(url)

# Polling: look up the tables right away, then keep re-checking until
# exactly two tables are present on the page.
tbls = b.find_elements("tag name", "table")
print("Tables:", len(tbls))

while len(tbls) != 2:
    time.sleep(0.1)  # sleep for 0.1 second between checks
    tbls = b.find_elements("tag name", "table")
    print("Tables:", len(tbls))

print(tbls)

### Let's extract the 2nd table information

In [None]:
# Work on the last table found by the polling loop.
tbl = tbls[-1]

# Every tr element is one row of that table.
trs = tbl.find_elements("tag name", "tr")

# Collect the text of the td cells of each row into a list of lists.
rows = []
for tr in trs:
    tds = tr.find_elements("tag name", "td")
    assert len(tds) == 2  # each row is expected to hold exactly two cells
    rows.append([td.text for td in tds])

rows

### Converting `rows` into a `DataFrame`

In [None]:
# Build a DataFrame from `rows`; each inner list becomes one row of the frame.
pd.DataFrame(rows)

### How can we visually see the page on the VM?

- Operations:
 - `b.save_screenshot("some_file.png")`: saves a screenshot of the rendered page
 - `b.set_window_size(<width>, <height>)`: controls size of the image
 - import statement: `from IPython.display import display, Image`: helps us show the screenshot as an image inside the notebook

### Combining taking screenshot and displaying it
- useful for p3

In [None]:
def show_screen(width=500, height=500):
    """
    Take a screenshot of the page currently loaded in browser `b` and
    display it inside the notebook.

    The window is resized BEFORE the screenshot is saved so that the
    requested size applies to the image being displayed; resizing after
    saving would only affect the next screenshot.

    width, height: browser window size in pixels. Both default to 500 so
    the function can also be called as `show_screen()`.

    The image is written to "out.png" (overwritten on every call).
    """
    b.set_window_size(width, height)
    b.save_screenshot("out.png")
    display(Image("out.png"))

### page2.html: "Show More!" button example

- Operations:
 - `button_object.click()`: enables us to click the button

In [None]:
# Load page2.html in the browser.
url = "https://cs320.cs.wisc.edu/tricky/page2.html"
b.get(url)

In [None]:
# The "Show More!" button on page2.html has the id "more"
# (visible via inspect element in a browser).
button = b.find_element("id", "more")

In [None]:
# TODO: click the button

# keep running this cell repeatedly
# once all data is retrieved, we will run into NoSuchElementException

In [None]:
# Reload the page, then keep clicking the "more" button until it can no
# longer be found, which signals that all the data has been loaded.
b.get(url)

while True:
    try:
        button = b.find_element("id", "more")
        button.click()
        show_screen(500, 500)
        print("============================================================")
    except NoSuchElementException:
        print("We have all the data!")
        break
    else:
        # pause for a second before looking for the button again
        time.sleep(1)

In [None]:
# Print the HTML source of the page currently loaded in the browser.
print(b.page_source)

### page 3: password protection example

- Operations:
 - `text_object.send_keys()`: enables us to send data to textbox

In [None]:
# Load page3.html in the browser.
url = "https://cs320.cs.wisc.edu/tricky/page3.html"
b.get(url)

In [None]:
# TODO: find the id for password box (inspect element on browser)
# TODO: find the id for the login button (inspect element on browser)
# NOTE: the empty id strings below are placeholders for the ids found in the TODOs above.
text = b.find_element("id", "")
button = b.find_element("id", "")

# TODO: send the password (plain text just for example purposes)

# Screenshot taken after the password step.
show_screen()

# TODO: click the button

# Screenshot taken after the click step.
show_screen()

In [None]:
# Print the HTML source of the page currently loaded in the browser.
print(b.page_source)

### page 4: search data for a year

- Operations:
 - `text_object.clear()`: enables us to clear the previous text

In [None]:
# Load page4.html in the browser.
url = "https://cs320.cs.wisc.edu/tricky/page4.html"
b.get(url)

In [None]:
# TODO: find the id for year box (inspect element on browser)
# TODO: find the id for the search button (inspect element on browser)
# NOTE: the empty id strings below are placeholders for the ids found in the TODOs above.
text = b.find_element("id", "")
button = b.find_element("id", "")

In [None]:
# Type a year into the text box, click the button, and show the resulting page.
# NOTE(review): the text box is not cleared first, so a second run presumably
# appends to the earlier input — confirm by running the cell twice.
text.send_keys("1952")
button.click()
show_screen()
# TODO: run this cell twice

#### How many hurricanes were there each year?

In [None]:

# For each year from 1950 through 1959: clear the text box, type a value,
# click the button, and show the resulting page.
for year in range(1950, 1960):
 text.clear()  # clear the previous text before typing the next value
 # `???` is a placeholder to be filled in — presumably the current year as a string.
 text.send_keys(???)
 button.click()
 show_screen()
 
 # TODO: find all tr elements and count hurricanes for each year
 
 # TODO: We have to subtract 1 for removing header tr element
 
 
# ax = hurricane_counts.plot.line()
# ax.set_xlabel("Year")
# ax.set_ylabel("Hurricane count")

## Recursive Crawl

- crawling: process of finding all the webpages inside a website

In [None]:
# Load the first page of the practice site and print its HTML source.
url = "https://cs320.cs.wisc.edu/crawl/practice1/1.html"
# Without this GET request, page_source would still hold the previously loaded page.
b.get(url)

print(b.page_source)

In [None]:
# TODO: show the screen


### Find all hyperlinks

- Selenium operations:
 - `b.get(URL)`: sends HTTP GET request to the URL
 - `b.page_source`: HTML source for the page
 - `b.find_elements("id", <ID>)`: searches for elements whose id matches <ID>
 - `b.find_elements("tag name", <TAG>)`: searches for elements with the corresponding tag name
 - `b.find_element` versus `b.find_elements`:
   - `find_element` gives first match
   - `find_elements` gives all matches
 - `<element obj>.text`: gives text associated with that element 
 - `<element obj>.get_attribute(<attribute>)`: gives attribute value; for ex: `<anchor_obj>.get_attribute("href")`
 
 - `b.save_screenshot("some_file.png")`: saves a screenshot of the rendered page
 - `b.set_window_size(<width>, <height>)`: controls size of the image
 - import statement: `from IPython.display import display, Image`: helps us show the screenshot as an image inside the notebook
 - `button_object.click()`: enables us to click the button
 - `text_object.send_keys()`: enables us to send data to textbox

In [None]:
# Collect every anchor (a) element on the current page, then print the
# visible text of each one next to the value of its href attribute.
a_elements = b.find_elements("tag name", "a")
for anchor in a_elements:
    link_text = anchor.text
    href = anchor.get_attribute("href")
    print(link_text, href)

In [None]:
def get_children(url):
    """
    Finds all hyperlinks in the given url by sending GET request and parsing page source.
    Returns a list of children URLs.

    Uses the notebook-level browser `b`, so after the call the browser is
    left on `url`. Anchors without an href attribute are skipped.
    """
    b.get(url)
    hrefs = (a.get_attribute("href") for a in b.find_elements("tag name", "a"))
    # get_attribute gives None when the attribute is absent; such anchors
    # do not point to a child page.
    return [href for href in hrefs if href]


url = "https://cs320.cs.wisc.edu/crawl/practice1/1.html"
get_children(url)

### Breadth First Search

- for crawling, there is no specific "destination", as we need to find all the webpages.

In [None]:
# Page where the crawl begins; the commented-out line below is an alternative start page.
start_url = "https://cs320.cs.wisc.edu/crawl/practice1/1.html"
#start_url = "https://cs320.cs.wisc.edu/crawl/practice7/1.html"

# Why use a set to keep track of visited nodes?

# TODO: create a Digraph


 # TODO: add current node to digraph
 
 # TODO: how do we get all the children?
 
 
 # TODO: add an edge
 