lec10 updated

d643ce73 · gsingh58 · 2d3e3d87 · d643ce73
Commit d643ce73 authored 8 months ago by gsingh58
--- a/lecture_material/10-Graph_Search_2/10-graph_search_001.ipynb
+++ b/lecture_material/10-Graph_Search_2/10-graph_search_001.ipynb
@@ -140,12 +140,6 @@
    "        #    These methods in two different classes just happen to share the same name\n",
    "        self.visited.clear()\n",
    "        return self.nodes[src_name].dfs_search(self.nodes[dst_name])\n",
-    "    \n",
-    "    def bfs_search(self, src_name, dst_name):\n",
-    "        \"\"\"\n",
-    "        Invokes bfs_search using Node object instance with name src_name.\n",
-    "        \"\"\"\n",
-    "        return self.nodes[src_name].bfs_search(self.nodes[dst_name])\n",
    "\n",
    "class Node:\n",
    "    def __init__(self, name):\n",

 %% Cell type:markdown id:d684d88e-e96d-4392-b4d6-92d3f1669b32 tags:
  
 # Breadth First Search (BFS)
  
 - BST (binary search tree) search: finds a value; works only on a BST; search complexity is **O(log N)** when the tree is balanced
 - DFS (depth first search): finds a path from one node to another using recursive search; it does **not** explore all the children before the grandchildren or other descendants -- works on any directed graph
 - BFS (breadth first search): finds a path from one node to another by exploring all children first, before the grandchildren or any other descendants -- works on any directed graph
  
 %% Cell type:code id:66ba808a tags:
  
 ``` python
 # known import statements
 from graphviz import Digraph
 import time
 import random
 import pandas as pd
 import matplotlib.pyplot as plt
  
 # new import statements
 ```
  
 %% Cell type:code id:c00e99eb tags:
  
 ``` python
 def example(num):
    """
    Builds and returns one of the predefined example graphs.

    num selects the example (1 through 5); any other value raises
    Exception("no such example").
    """
    graph = Graph()
    if num == 1:
        # "A" is added explicitly because no edge touches it
        graph.node("A")
        pairs = [("B", "C"), ("C", "D"), ("D", "B")]
    elif num == 2:
        pairs = [("A", "B"), ("B", "C"), ("C", "D"), ("D", "E"), ("A", "E")]
    elif num == 3:
        pairs = [("A", "B"), ("A", "C"), ("B", "D"), ("B", "E"),
                 ("C", "F"), ("C", "G")]
    elif num == 4:
        pairs = [("A", "B"), ("A", "C"), ("B", "D"), ("B", "E"),
                 ("C", "F"), ("C", "G"), ("E", "Z"), ("C", "Z"), ("B", "A")]
    elif num == 5:
        # layered graph: every node of one layer points to every node of
        # the next layer; node names are "<layer>-<index>"
        cols, rows = 8, 4
        pairs = [
            (f"{upper}-{i}", f"{upper + 1}-{j}")
            for upper in range(rows - 1)
            for i in range(cols - (rows - upper - 1))
            for j in range(cols - (rows - (upper + 1) - 1))
        ]
    else:
        raise Exception("no such example")
    # edges are added in the listed order
    for src, dst in pairs:
        graph.edge(src, dst)
    return graph
 ```
  
 %% Cell type:markdown id:6690b3be tags:
  
 ### For a regular graph, you need a new class `Graph` to keep track of the whole graph.
 - Why? Remember graphs need not have a "root" node, which means there is no one origin point
  
 %% Cell type:code id:8f5e8b06 tags:
  
 ``` python
 class Graph:
    """
    Directed graph that stores its Node objects by name, together with the
    visited set shared by the nodes during a search.
    """

    def __init__(self):
        # nodes already reached by the search in progress
        self.visited = set()
        # lookup table: name => Node
        self.nodes = {}
  
    def node(self, name):
        """
        Creates a Node called name and stores it under that name
        (replacing any node already stored there).
        """
        created = Node(name)
        created.graph = self
        self.nodes[name] = created
  
    def edge(self, src, dst):
        """
        Adds a directed edge from src to dst.
        Endpoints that do not exist yet are created automatically.
        """
        for endpoint in (src, dst):
            if endpoint not in self.nodes:
                self.node(endpoint)
        target = self.nodes[dst]
        self.nodes[src].children.append(target)
  
    def _repr_svg_(self):
        """
        Builds a graphviz Digraph with every node and each of its outgoing
        edges, then returns its SVG representation.
        """
        dot = Digraph()
        for name, node in self.nodes.items():
            dot.node(name)
            for child in node.children:
                dot.edge(name, child.name)
        return dot._repr_image_svg_xml()
  
    def dfs_search(self, src_name, dst_name):
        """
        Resets the visited set, then starts dfs_search on the Node named
        src_name, looking for the Node named dst_name.
        """
        # Q: is this method recursive?
        # A: no -- it only delegates to the dfs_search method of a Node
        #    object; that method (in the Node class) is the recursive one.
        #    The two methods live in different classes and merely share a name.
        self.visited.clear()
        start = self.nodes[src_name]
        goal = self.nodes[dst_name]
        return start.dfs_search(goal)
  
-    def bfs_search(self, src_name, dst_name):
-        """
-        Invokes bfs_search using Node object instance with name src_name.
-        """
-        return self.nodes[src_name].bfs_search(self.nodes[dst_name])
-
 class Node:
    """
    A single graph node: a name, a list of child nodes (outgoing edges),
    and a back reference to the object that holds the shared visited set.
    """

    def __init__(self, name):
        self.name = name
        self.children = []
        self.graph = None # back reference
        self.finder = None # who found me during BFS
  
    def __repr__(self):
        return self.name
  
    def dfs_search_v1(self, dst):
        """
        Returns True / False when path to dst is found / not found

        Uses self.graph.visited to avoid revisiting nodes; the caller is
        responsible for clearing that set before starting a new search.
        """
        # already explored (e.g. reached again through a cycle): stop here
        if self in self.graph.visited:
            return False
        self.graph.visited.add(self)
  
        # simplest case: the current node is the dst
        if self == dst:
            return True
  
        # otherwise go as deep as possible through each child in turn
        for child in self.children:
            if child.dfs_search_v1(dst):
                return True
  
        return False
  
    def dfs_search(self, dst):
        """
        Returns the actual path to the dst as a tuple or None otherwise

        The tuple starts with this node and ends with dst. Uses
        self.graph.visited to avoid revisiting nodes; the caller is
        responsible for clearing that set before starting a new search.
        """
        # already explored (e.g. reached again through a cycle): stop here
        if self in self.graph.visited:
            return None
        self.graph.visited.add(self)
  
        # simplest case: the current node is the dst
        if self == dst:
            return (self,)
  
        for child in self.children:
            child_path = child.dfs_search(dst)
            # identity check against None (PEP 8) instead of "!= None"
            if child_path is not None:
                # prepend this node to the path found below the child
                return (self,) + child_path
  
        return None
  
  
 g = example(1)
 g
 ```
  
 %% Cell type:markdown id:c83e9993-765c-42a0-97f6-6277627acf95 tags:
  
 ### Testcases for DFS
  
 %% Cell type:code id:d25ec05c-f6b8-4843-8107-64a4b79c5b5f tags:
  
 ``` python
 print(g.dfs_search("B", "A")) # should return None
 print(g.dfs_search("B", "D")) # should return (B, C, D)
 ```
  
 %% Cell type:markdown id:a92903d6 tags:
  
 ### Why is it called "*Depth* First Search"?
  
 - we start at the starting node and go as deep as possible because recursion always goes as deep as possible before coming back to the other children in the previous level
 - we need a `Stack` data structure:
    - Last-In-First-Out (LIFO)
 - recursion naturally uses `Stack`, which is why we don't have to explicitly use a `Stack` data structure
 - might not give us the shortest possible path
  
 %% Cell type:code id:e5baef43 tags:
  
 ``` python
 g = example(2)
 g
 ```
  
 %% Cell type:code id:2a0b9c3d-1838-433d-82dd-14ba3859198f tags:
  
 ``` python
 print(g.dfs_search("A", "E")) # should return (A, B, C, D, E)
 print(g.dfs_search("E", "A")) # should return None
 ```
  
 %% Cell type:markdown id:378d7ce7 tags:
  
 ### Breadth first search
  
 - find the shortest path by exploring all children first, before the grandchildren or any other descendants
 - we need a `Queue` data structure:
    - First-In-First-Out (FIFO)
 - unlike DFS, BFS gives us the shortest possible path (the one with the fewest edges)
  
 %% Cell type:code id:75e190d3 tags:
  
 ``` python
 # TODO: let's define bfs_search method
 ```
  
 %% Cell type:code id:29031c60-26c4-4ce6-aeb2-7d34efce50e5 tags:
  
 ``` python
 g = example(3)
 g
 ```
  
 %% Cell type:code id:3aef637c-068c-4fa7-b8cc-e7a069bc2ebb tags:
  
 ``` python
 print(g.bfs_search("A", "D"))
 ```
  
 %% Cell type:code id:c15583e6-86e7-4a82-86ef-1cf1a5170fab tags:
  
 ``` python
 g = example(2)
 g
 ```
  
 %% Cell type:code id:2bdc2e53-5d49-431d-a5fd-73de64baf9b4 tags:
  
 ``` python
 print(g.bfs_search("A", "E"))
 ```
  
 %% Cell type:code id:d7fd243b-ae5f-42f6-b52a-33cf2fa93d64 tags:
  
 ``` python
 g = example(1)
 g
 ```
  
 %% Cell type:code id:27943344-0e56-41e1-b3d8-306530b419a3 tags:
  
 ``` python
 print(g.bfs_search("B", "D")) # should return (B, C, D)
 ```
  
 %% Cell type:code id:94e7bc72-8158-40e9-b615-573fc9506840 tags:
  
 ``` python
 print(g.bfs_search("B", "A")) # should return None
 # what's wrong?
 ```
  
 %% Cell type:markdown id:f25a8642-227a-4e7c-a2c9-6677dc7f5fde tags:
  
 ### How do we find the path using BFS?
  
 %% Cell type:code id:3e5053d9-ff2a-41a0-987c-84a0880f4f13 tags:
  
 ``` python
 g = example(3)
 print(g.bfs_search("A", "E"))
 g
 ```
  
 %% Cell type:code id:34adcada-e97d-4725-912f-f519cbe68bfd tags:
  
 ``` python
 g.nodes["E"]
 ```
  
 %% Cell type:code id:bd5239e5-d7a4-4277-a30c-71f4eac0eaf6 tags:
  
 ``` python
 ```
  
 %% Cell type:code id:d0790031-14df-4416-a666-95b0e3b1f5f2 tags:
  
 ``` python
 ```
  
 %% Cell type:code id:a9f87651-cdd2-46a0-8fbc-0f349fbea8e7 tags:
  
 ``` python
 # TODO: let's go back and implement a backtrace method to help us trace back this path
 ```
  
 %% Cell type:markdown id:3b1b32c7 tags:
  
 ### BFS complexity
  
 - focus on the data structures that we use and identify the single line of code that is a slow operation
 - Assuming N nodes in the directed graph, what will the current algorithm's complexity be?
    - `O(N**2)`
    - Why? `to_visit.pop(0)` is an O(N) operation because the remaining list items must slide over every time we remove the first item, and we may do this once for each of the N nodes
  
 - Can we make this algorithm better by using a different data structure than a list?
  
 %% Cell type:markdown id:90d36820 tags:
  
 ### Queueing Structures
  
 - Stack:
    - First-In-Last-Out (FILO) or Last-In-First-Out (LIFO)
 - Queue:
    - First-In-First-Out (FIFO)
 - Priority Queue:
    - Highest priority item is kept first always
  
 <div>
 <img src="attachment:work_queue_slow_operations.png" width="600"/>
 </div>
  
 Slow operations:
 - `x = L.pop(0)`
 - `L.sort()`
  
 %% Cell type:markdown id:1ae078cd tags:
  
 ### Using `deque` to improve "BFS" complexity
  
 - Documentation: https://docs.python.org/3/library/collections.html#collections.deque
 - Search for "O(" in the documentation to get a sense of complexity of various operations. Here are a few highlights:
    - memory efficient appends and pops from either side of the deque with approximately the same **O(1)** performance in either direction
    - Indexed access is O(1) at both ends but slows to O(n) in the middle. For fast random access, use lists instead.
  
 - Operations:
    - `collections.deque([iterable[, maxlen]])`
    - `append(x)` ---> **O(1)** operation, equally efficient when compared to list's **O(1)** for adding item at the end
    - `popleft()` ---> **O(1)** operation, highly efficient when compared to list's **O(N)** for popping item at index 0
  
 %% Cell type:markdown id:bba7b04c tags:
  
 What are some examples of `iterable`? `list`, `str`, `range(...)`, `dict`, basically anything that can be iterated using a `for` loop.
  
 %% Cell type:markdown id:df5a1b45 tags:
  
 Create a deque by passing a list with 3 items as argument.
  
 %% Cell type:code id:1b0bcad4 tags:
  
 ``` python
 d = ???
 d
 ```
  
 %% Cell type:markdown id:ad7ea162 tags:
  
 Append a new item to your deque.
  
 %% Cell type:code id:9bb00305 tags:
  
 ``` python
  
 d
 ```
  
 %% Cell type:markdown id:a7b72757 tags:
  
 Pop the item at index 0 using `popleft`.
  
 %% Cell type:code id:3dcbd60e tags:
  
 ``` python
 item = ???
 print(item, d)
 ```
  
 %% Cell type:markdown id:420abec2 tags:
  
 ### Go back to `bfs_search` and update our `to_visit` list usage to use a `deque` instead
  
 %% Cell type:code id:5ddae33d tags:
  
 ``` python
 # TODO: update bfs_search to make the implementation efficient
 ```
  
 %% Cell type:markdown id:af3c077f tags:
  
 ### What if we wanted to explore nodes in a different order other than `DFS` and `BFS` ordering?
  
 <div>
 <img src="attachment:shortest_path.png" width="600"/>
 </div>
  
 %% Cell type:markdown id:a91d9616 tags:
  
 ### Using `heapq` (priority queue)
  
 <div>
 <img src="attachment:Priority_queue.png" width="600"/>
 </div>
  
 %% Cell type:markdown id:35dcf2c1 tags:
  
 **IMPORTANT**: Unlike `deque`, `heapq` is not a type by itself; it is a module whose functions operate on a regular `list`.
  
 - Operations:
    - `heapq.heappush(LIST, VAL)`:
        - adds VAL to the list
        - shuffles around the values so that the smallest value is in the front
        - does so in **O(log N)** time (so it is not sorting, but just shuffling)
    - `val = heapq.heappop(LIST)`
        - removes the smallest item from the list
        - re-shuffles around the values so that the smallest value is in the front
        - does so in **O(log N)** time (so it is not sorting, but just shuffling)
  
 %% Cell type:code id:2b05ffde tags:
  
 ``` python
 data = []
 vals = [5, 3, 1, 4, 2]
 ```
  
 %% Cell type:code id:e1200db8 tags:
  
 ``` python
 ```
  
 %% Cell type:markdown id:d3712e2e tags:
  
 ### Benchmarking `stack`, `queue`, and `priority queue` for performance
  
 %% Cell type:code id:9ddfb57d tags:
  
 ``` python
 # number of operations timed per benchmark run
 iters = 1000
  
 def benchmark_microsec(data, pattern):
    """
    Benchmarking "stack", "queue", "priority queue"
    implementations using a list data structure

    data: list the operations are applied to (modified in place)
    pattern: one of "stack", "queue", "prio queue"

    Returns the average time per iteration in microseconds.
    Raises Exception("pattern not supported") for any other pattern.
    """
    # perf_counter is the clock intended for measuring short durations;
    # time.time() is a wall clock and can be too coarse for this
    t0 = time.perf_counter()
    # measure bad ways to implement the patterns (all with a list!)
    for i in range(iters):
        if pattern == "stack":
            # push and pop at the end of the list
            data.append(i % 10)
            _ = data.pop(-1)
        elif pattern == "queue":
            pass
            # TODO: complete the code for queue
        elif pattern == "prio queue":
            pass
            # TODO: complete the code for priority queue
        else:
            raise Exception("pattern not supported")
    t1 = time.perf_counter()
    # seconds for all iterations -> microseconds for one iteration
    return (t1-t0) / iters * 1e6
  
 # Timing table: one row per list size N; only the "stack" column is filled in here
 df = pd.DataFrame()
 for N in [1000,2000,5000,10000]:
    # every measurement starts from a fresh list of N items
    df.loc[N,"stack"] = benchmark_microsec([1]*N, "stack")
  
 # Line plot of the measured time per operation against N
 plt.rcParams["font.size"] = 16
 df.plot.line(ylim=0)
 plt.xlabel("N")
 plt.ylabel("Microseconds")
 # final expression of the cell, so the notebook also shows the table itself
 df
 ```
  
 %% Cell type:code id:52bc7350 tags:
  
 ``` python
 def benchmark_microsec_v2(data, pattern):
    """
    Benchmarking "stack", "queue", "priority queue"
    implementations using list, deque, and heapq list data structures

    NOTE: only this docstring is present so far; the body is still to be
    written, so calling the function currently returns None.
    """
 ```