diff --git a/lecture_material/14-web-4-and-regex-1/web_4.py b/lecture_material/14-web-4/web_4.py similarity index 100% rename from lecture_material/14-web-4-and-regex-1/web_4.py rename to lecture_material/14-web-4/web_4.py diff --git a/lecture_material/14-web-4-and-regex-1/web_4_lec_001.py b/lecture_material/14-web-4/web_4_lec_001.py similarity index 100% rename from lecture_material/14-web-4-and-regex-1/web_4_lec_001.py rename to lecture_material/14-web-4/web_4_lec_001.py diff --git a/lecture_material/14-web-4-and-regex-1/web_4_lec_002.py b/lecture_material/14-web-4/web_4_lec_002.py similarity index 100% rename from lecture_material/14-web-4-and-regex-1/web_4_lec_002.py rename to lecture_material/14-web-4/web_4_lec_002.py diff --git a/lecture_material/14-web-4-and-regex-1/14-web-4-and-regex-1.pdf b/lecture_material/15-regex/15-regex.pdf similarity index 100% rename from lecture_material/14-web-4-and-regex-1/14-web-4-and-regex-1.pdf rename to lecture_material/15-regex/15-regex.pdf diff --git a/lecture_material/14-web-4-and-regex-1/14-web-4-and-regex-1.pptx b/lecture_material/15-regex/15-regex.pptx similarity index 100% rename from lecture_material/14-web-4-and-regex-1/14-web-4-and-regex-1.pptx rename to lecture_material/15-regex/15-regex.pptx diff --git a/lecture_material/14-web-4-and-regex-1/regex_1.ipynb b/lecture_material/15-regex/regex.ipynb similarity index 92% rename from lecture_material/14-web-4-and-regex-1/regex_1.ipynb rename to lecture_material/15-regex/regex.ipynb index 5b836e41a9c474fce19fae01502363065f70dc31..34105575ab8e8e2c41b106e14432b20ec24d21df 100644 --- a/lecture_material/14-web-4-and-regex-1/regex_1.ipynb +++ b/lecture_material/15-regex/regex.ipynb @@ -10,7 +10,7 @@ "id": "e60c1c48", "metadata": {}, "source": [ - "# Regex 1\n", + "# Regex\n", "\n", "## Reading\n", "\n", @@ -128,7 +128,9 @@ "outputs": [], "source": [ "#import statements\n", - "import re" + "import re\n", + "from subprocess import check_output\n", + "import pandas as pd" ] }, { 
@@ -1404,6 +1406,1188 @@ "source": [ "In CS <b>320</b>, there are <b>40</b> lectures, <b>10</b> quizzes, <b>3</b> exams, <b>6</b> projects, and <b>1000</b> things to learn. CS <b>320</b> is awesome!" ] + }, + { + "cell_type": "markdown", + "id": "d53219fc-977e-41af-b6c6-c7ab7170ee13", + "metadata": {}, + "source": [ + "### Git log example" + ] + }, + { + "cell_type": "markdown", + "id": "aaf31f0b-b3d4-40a3-a597-55f18c1d9373", + "metadata": {}, + "source": [ + "#### Run `git log` as a shell command" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "23004baa-f037-4494-af50-de321f149f44", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mcommit d53c732e0da3ad0cd625c25e02703face75511ee\u001b[m\u001b[33m (\u001b[m\u001b[1;36mHEAD -> \u001b[m\u001b[1;32mmain\u001b[m\u001b[33m, \u001b[m\u001b[1;31morigin/main\u001b[m\u001b[33m, \u001b[m\u001b[1;31morigin/HEAD\u001b[m\u001b[33m)\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Mon Mar 18 20:43:56 2024 -0500\n", + "\n", + " zip folders added to lec17\n", + "\n", + "\u001b[33mcommit 0eba23b48835a617b53191de5aa8f2709517bb7a\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Mon Mar 18 20:36:44 2024 -0500\n", + "\n", + " lec16 and lec17 updated\n", + "\n", + "\u001b[33mcommit 844963cd59c769ef379b0272140b554aee6d9e60\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Thu Mar 14 13:00:40 2024 -0500\n", + "\n", + " lab11\n", + "\n", + "\u001b[33mcommit 31882131674f49276c66a6764113d5347721f239\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Thu Mar 14 06:39:42 2024 -0500\n", + "\n", + " number of lectures updated in regex_2.ipynb\n", + "\n", + "\u001b[33mcommit 08b3243a9ed2a67cdcfcbbb080910f8c86716712\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Wed Mar 13 10:05:24 2024 -0500\n", + "\n", + " lec14&15 updatd\n", + "\n", + "\u001b[33mcommit 
f5b5fff2409b01c2e12f3a7baae131e9a23d964b\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Wed Mar 13 09:39:09 2024 -0500\n", + "\n", + " regex_2_lec_002 renamed as regex_1_lec_002\n", + "\n", + "\u001b[33mcommit e017cc00d97679786d97c469733117c016e3ba9b\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Wed Mar 13 09:34:43 2024 -0500\n", + "\n", + " web_4_lec_002 added\n", + "\n", + "\u001b[33mcommit 7897fec9ecf701e61b2ed2713fabe72e726ffd7c\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Tue Mar 12 05:51:48 2024 -0500\n", + "\n", + " lec14 and lec15 added\n", + "\n", + "\u001b[33mcommit c13cfb21e69230b393ef2051027d029322c12cac\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Mon Mar 11 10:36:36 2024 -0500\n", + "\n", + " Update file README.md\n", + "\n", + "\u001b[33mcommit 95bfde13c281a6eb28f0cd98f6dc393762732d4c\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sat Mar 9 18:45:08 2024 -0600\n", + "\n", + " lab10\n", + "\n", + "\u001b[33mcommit 90111df9e72309597c67da15bb61de78bff126e7\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sat Mar 9 18:43:47 2024 -0600\n", + "\n", + " lab9\n", + "\n", + "\u001b[33mcommit c401ade096b0dd3e1178f46d067b2fbb98d499f3\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sat Mar 9 18:43:20 2024 -0600\n", + "\n", + " Update 2 files\n", + " \n", + " - /Labs/Lab9/EDGAR.md\n", + " - /Labs/Lab9/README.md\n", + "\n", + "\u001b[33mcommit c99c65b5efdaa91f8fce8fadffc41c4747e0a3a0\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Thu Mar 7 08:36:02 2024 -0600\n", + "\n", + " lec13 updated\n", + "\n", + "\u001b[33mcommit df5877233cc57005e3003c19c0dbf89aafc53804\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Thu Mar 7 07:02:16 2024 -0600\n", + "\n", + " lec12 & 13 updated\n", + "\n", + "\u001b[33mcommit 26470de563bbe3bc65a73718aee1988ba64e8601\u001b[m\n", + 
"Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Tue Mar 5 05:22:26 2024 -0600\n", + "\n", + " lec12 and 13: ipynb files added\n", + "\n", + "\u001b[33mcommit 406a4231cd865f81653eb97f7d5f61136360f9e5\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sun Mar 3 17:41:42 2024 -0600\n", + "\n", + " Update file README.md\n", + "\n", + "\u001b[33mcommit 64745e91caeed8149cb33be037b795f88b4ddd39\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Sun Mar 3 05:03:29 2024 -0600\n", + "\n", + " lec12 and lec13 updated\n", + "\n", + "\u001b[33mcommit 42568d125bb87a10c178ec1d4640e467f2d826b0\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Wed Feb 28 22:31:11 2024 -0600\n", + "\n", + " lab8\n", + "\n", + "\u001b[33mcommit e00ff203a601993ec07c23420ff223707228e820\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Wed Feb 28 22:26:10 2024 -0600\n", + "\n", + " lab6\n", + "\n", + "\u001b[33mcommit 408cff34905add795fb9cce796913372b184a60b\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Wed Feb 28 22:25:47 2024 -0600\n", + "\n", + " add lab7\n", + "\n", + "\u001b[33mcommit dd5c119d6d72ee039a3eb10608d2f57f56933d72\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Wed Feb 28 22:22:15 2024 -0600\n", + "\n", + " add lab6\n", + "\n", + "\u001b[33mcommit 27ce693c05fe8beae4bd2b15715454c6c3c3f3f7\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Tue Feb 27 00:24:06 2024 -0600\n", + "\n", + " lec11 updated\n", + "\n", + "\u001b[33mcommit 8fda6507879ad4a220675d7af1dcc4f2dc6eb8cb\u001b[m\n", + "Author: gsingh58 <gurmail-singh@cs.wisc.edu>\n", + "Date: Thu Feb 22 16:57:25 2024 -0600\n", + "\n", + " redundant html files deleted\n", + "\n", + "\u001b[33mcommit 3693ad5b1f9ff5e2ed5af15baac3579715d26c9d\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Thu Feb 22 03:45:27 2024 -0600\n", + "\n", + " lec10 updated\n", + "\n", + 
"\u001b[33mcommit 6b7ac80ca513020f4ac43898001896aeec597179\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Thu Feb 22 01:48:04 2024 -0600\n", + "\n", + " lec9 updated again\n", + "\n", + "\u001b[33mcommit 3c0d23f6d2568da03b4ad8032e912cf2899af829\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Wed Feb 21 15:07:18 2024 -0600\n", + "\n", + " add screenshot req\n", + "\n", + "\u001b[33mcommit 5ca95af225907454cf9ffc50d6fadd2dae8838a1\u001b[m\n", + "Author: gsingh58 <gurmail-singh@cs.wisc.edu>\n", + "Date: Tue Feb 20 13:36:44 2024 -0600\n", + "\n", + " lec9 solution updated\n", + "\n", + "\u001b[33mcommit eb9f52e5c56b09918c76c25d431fae6e4f0aca59\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Tue Feb 20 09:32:16 2024 -0600\n", + "\n", + " add lab4 req\n", + "\n", + "\u001b[33mcommit 5caf51ade3733e981b801edd007155334dd70a29\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Tue Feb 20 07:37:59 2024 -0600\n", + "\n", + " Lec9 updated HTML removed\n", + "\n", + "\u001b[33mcommit e8915e7e2c1daef12cf968949c7641afa11c0758\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Tue Feb 20 07:33:57 2024 -0600\n", + "\n", + " lec9 updated\n", + "\n", + "\u001b[33mcommit 15b3e1d7c9ea971f161b2fb4df4f253a45588fe8\u001b[m\n", + "Author: gsingh58 <gurmail-singh@cs.wisc.edu>\n", + "Date: Thu Feb 15 13:45:20 2024 -0600\n", + "\n", + " a starter.ipynb file deleted\n", + "\n", + "\u001b[33mcommit 13ab0a06e5fa6ad7918b3564ab92d8e4a1914be1\u001b[m\n", + "Author: gsingh58 <gurmail-singh@cs.wisc.edu>\n", + "Date: Thu Feb 15 13:44:18 2024 -0600\n", + "\n", + " lec8 slides added\n", + "\n", + "\u001b[33mcommit faf0945797857eff185318c4146fdd00b57acd87\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Wed Feb 14 15:37:26 2024 -0600\n", + "\n", + " add lab5\n", + "\n", + "\u001b[33mcommit 19cd94a871641d13e77c58b03bfa50b53645473d\u001b[m\n", + "Author: JINLANG WANG 
<jwang2775@wisc.edu>\n", + "Date: Tue Feb 13 10:35:16 2024 -0600\n", + "\n", + " add exams\n", + "\n", + "\u001b[33mcommit 960fccecbdc787ecbe01fbe805e2d58255a47e35\u001b[m\n", + "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", + "Date: Tue Feb 13 03:49:40 2024 -0600\n", + "\n", + " Add new directory\n", + "\n", + "\u001b[33mcommit aa18ba4a4daa4c07a794b95aa647d33cbc4b21fc\u001b[m\n", + "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", + "Date: Tue Feb 13 03:48:21 2024 -0600\n", + "\n", + " Add new directory\n", + "\n", + "\u001b[33mcommit bf9845e5336eb0db7f309b55665c7bc46035bbf4\u001b[m\n", + "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", + "Date: Tue Feb 13 03:48:08 2024 -0600\n", + "\n", + " Add new directory\n", + "\n", + "\u001b[33mcommit 64fd3ae4f7677738f7fa88f75434674fb5b673fb\u001b[m\n", + "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", + "Date: Tue Feb 13 03:47:08 2024 -0600\n", + "\n", + " Add new directory\n", + "\n", + "\u001b[33mcommit 7daae2c37e007f641988e96e8252281a926089dd\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Tue Feb 13 03:40:01 2024 -0600\n", + "\n", + " worksheet add\n", + "\n", + "\u001b[33mcommit 489b6fba8db75a0b2ca55af0e774d21bdcf53447\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Tue Feb 13 03:23:22 2024 -0600\n", + "\n", + " readings moved\n", + "\n", + "\u001b[33mcommit 84ec381168bf2faefc4013883ecad4041ca97dd7\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Tue Feb 13 02:45:35 2024 -0600\n", + "\n", + " lec7 and 8 updated\n", + "\n", + "\u001b[33mcommit 4eb6f5c315386e467751a7636dbfc3ecce49aa40\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Thu Feb 8 14:19:12 2024 -0600\n", + "\n", + " add lab4\n", + "\n", + "\u001b[33mcommit ffce38e0a69d68e43186e37a74f9c5db7838d57a\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Thu Feb 8 08:14:20 2024 -0600\n", + "\n", + " lec6 updated\n", + "\n", + "\u001b[33mcommit 
0436dacdd2a05927ee0cdc6b4f21c6af8bfd0f7f\u001b[m\n", + "Author: gsingh58 <gurmail-singh@cs.wisc.edu>\n", + "Date: Tue Feb 6 16:21:28 2024 -0600\n", + "\n", + " in_class_demo_lec2\n", + "\n", + "\u001b[33mcommit a5926f4b1152a2d1851a0c0d46e10337ec2e3307\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Tue Feb 6 13:20:11 2024 -0600\n", + "\n", + " update lab2\n", + "\n", + "\u001b[33mcommit e46ee7eb2210f5f4dceae060fe876f65bfaf9291\u001b[m\n", + "Author: gsingh58 <gurmail-singh@cs.wisc.edu>\n", + "Date: Tue Feb 6 11:23:25 2024 -0600\n", + "\n", + " wi.zip uploaded\n", + "\n", + "\u001b[33mcommit a58c5160b1df3fde7bb3a61efe0fb488d48a4f1e\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Tue Feb 6 08:16:41 2024 -0600\n", + "\n", + " lec5 notes updated\n", + "\n", + "\u001b[33mcommit aa58287550760afedc5d42c6d6a7418e256fc216\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Tue Feb 6 07:30:46 2024 -0600\n", + "\n", + " lec5 slides updated\n", + "\n", + "\u001b[33mcommit 775fe85700e8ea29573f379b762337214b0672a1\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Mon Feb 5 17:17:27 2024 -0600\n", + "\n", + " update lab2\n", + "\n", + "\u001b[33mcommit 513506b35bef83e8c9bdfd9101f3dd7e492f6630\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sun Feb 4 19:34:38 2024 -0600\n", + "\n", + " add lab3\n", + "\n", + "\u001b[33mcommit a171f55cba1a8ff24be8235638b36394974f5f4e\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sun Feb 4 19:30:18 2024 -0600\n", + "\n", + " Update file README.md\n", + "\n", + "\u001b[33mcommit c205681624d12c40cda836a9aa7bfecdfb9e1b5e\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sun Feb 4 19:29:13 2024 -0600\n", + "\n", + " Update file README.md\n", + "\n", + "\u001b[33mcommit 1961c8e880f7b8bea9fe276fe5cfeaf070c8fda3\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sun Feb 4 19:27:30 2024 
-0600\n", + "\n", + " update lab2\n", + "\n", + "\u001b[33mcommit d6f156b6680afce23e971fa02c7550b7ed1c8464\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sun Feb 4 19:27:00 2024 -0600\n", + "\n", + " update lab2\n", + "\n", + "\u001b[33mcommit 9933dd1f67182d29af40d794604474a942ac8d05\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sun Feb 4 19:13:10 2024 -0600\n", + "\n", + " add lab2\n", + "\n", + "\u001b[33mcommit ad8795c898efc2630a667e90db4fb3b198a1f281\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Sun Feb 4 07:52:55 2024 -0600\n", + "\n", + " redundant repos deleted\n", + "\n", + "\u001b[33mcommit 0c4c4e75d8efdcb5aaf5d0ef39e27b7dc7d3baf1\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Thu Feb 1 07:17:47 2024 -0600\n", + "\n", + " lec4 updated\n", + "\n", + "\u001b[33mcommit 78d0d8c28355c33713d8dc3a9cb271c9dc13b9fb\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Thu Feb 1 07:16:55 2024 -0600\n", + "\n", + " few more files added\n", + "\n", + "\u001b[33mcommit e15b0eae22fc127b87d155207ccf93687075429b\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Wed Jan 31 09:35:12 2024 -0600\n", + "\n", + " update lab1\n", + "\n", + "\u001b[33mcommit f6bf4ed07310ec20bdc81b07a7b84e931abc1d52\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Tue Jan 30 10:03:13 2024 -0600\n", + "\n", + " Update lab1.md\n", + "\n", + "\u001b[33mcommit 6ab88b7c35171871a9824fbde72b4886990744fc\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Tue Jan 30 08:18:02 2024 -0600\n", + "\n", + " a typo removed\n", + "\n", + "\u001b[33mcommit eb8dbdafb79b044801352602a23cce59e0cf2f5b\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Tue Jan 30 07:55:27 2024 -0600\n", + "\n", + " slides name changed\n", + "\n", + "\u001b[33mcommit acaeff6953d1faf55deac9ec23605368d8423407\u001b[m\n", + "Author: gsingh58 
<gurmail-singh@wisc.edu>\n", + "Date: Tue Jan 30 07:45:22 2024 -0600\n", + "\n", + " html file added\n", + "\n", + "\u001b[33mcommit d8ef60dc6f2e014366c4878c4811019b2704da84\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Tue Jan 30 07:37:33 2024 -0600\n", + "\n", + " lec3 updated\n", + "\n", + "\u001b[33mcommit d7f3849918a34536c532fa2dbad09199dd06ed15\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sat Jan 27 17:26:22 2024 -0600\n", + "\n", + " add lab1\n", + "\n", + "\u001b[33mcommit 845fad19e5e68b7ba37b8d9814d0d7391d28475d\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sat Jan 27 17:24:03 2024 -0600\n", + "\n", + " add lab1\n", + "\n", + "\u001b[33mcommit 3e1e4395110c0bd26b3f8cd638cd2987585e84db\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sat Jan 27 17:23:06 2024 -0600\n", + "\n", + " add lab1\n", + "\n", + "\u001b[33mcommit 34bcc1d6d1451a19515690ee2ccdbfd2b2bfd9c5\u001b[m\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sat Jan 27 17:22:04 2024 -0600\n", + "\n", + " add lab1\n", + "\n", + "\u001b[33mcommit 8199778a4ca97efeac99f1a091f2f2038d3eabd2\u001b[m\n", + "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", + "Date: Fri Jan 26 21:33:46 2024 -0600\n", + "\n", + " Add new directory\n", + "\n", + "\u001b[33mcommit 75866c80ee6a563bc31cd2182bdafd7371d01f62\u001b[m\n", + "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", + "Date: Fri Jan 26 21:33:36 2024 -0600\n", + "\n", + " Add new directory\n", + "\n", + "\u001b[33mcommit 8cd6029682fbe3253e1b045dc5c2523f99946e7e\u001b[m\n", + "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", + "Date: Fri Jan 26 21:33:26 2024 -0600\n", + "\n", + " Add new directory\n", + "\n", + "\u001b[33mcommit 99b66918e05eef2a193fee51f6159bf1268d361b\u001b[m\n", + "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", + "Date: Fri Jan 26 21:33:17 2024 -0600\n", + "\n", + " Add new directory\n", + "\n", + "\u001b[33mcommit 
63c27755fdb1c938a8c55a9152970cba77f08ff9\u001b[m\n", + "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", + "Date: Fri Jan 26 21:27:17 2024 -0600\n", + "\n", + " Add new directory\n", + "\n", + "\u001b[33mcommit 5ab60fd384fd1d3934b864d8d74358491c4c1cbc\u001b[m\n", + "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", + "Date: Fri Jan 26 21:26:12 2024 -0600\n", + "\n", + " Add new directory\n", + "\n", + "\u001b[33mcommit 66e8e7233ecfd7e2493c07c5ff8e6006d20bb24c\u001b[m\n", + "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", + "Date: Fri Jan 26 21:15:22 2024 -0600\n", + "\n", + " Add new directory\n", + "\n", + "\u001b[33mcommit ad08df7aa2e0bc2396dfe580c76f8b386b0248ff\u001b[m\n", + "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", + "Date: Fri Jan 26 20:25:44 2024 -0600\n", + "\n", + " Add new directory\n", + "\n", + "\u001b[33mcommit 86f9157696f95c31e4d914b4e89fb5974f537e87\u001b[m\n", + "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", + "Date: Fri Jan 26 20:25:29 2024 -0600\n", + "\n", + " Add new directory\n", + "\n", + "\u001b[33mcommit fe4faf3439ab3b8caab1748762d83891187f54e9\u001b[m\n", + "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", + "Date: Fri Jan 26 20:14:28 2024 -0600\n", + "\n", + " Add new directory\n", + "\n", + "\u001b[33mcommit bfb93b54fb43fc18d6dacd22865416f32027c0bb\u001b[m\n", + "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", + "Date: Fri Jan 26 20:14:06 2024 -0600\n", + "\n", + " Add new directory\n", + "\n", + "\u001b[33mcommit ed7d967ede5422f9f3e2eeba85a105ffa2d03db0\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Thu Jan 25 08:10:04 2024 -0600\n", + "\n", + " lec2 py file added\n", + "\n", + "\u001b[33mcommit 81f38731851c3b844e0d4855f3ccfbb259579d2f\u001b[m\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Thu Jan 25 07:38:47 2024 -0600\n", + "\n", + " lec2 update\n", + "\n", + "\u001b[33mcommit 92279f04219c9d9fdcf504f10e3cb7f41b9a9c3a\u001b[m\n", + "Author: GURMAIL SINGH 
<gurmail.singh@wisc.edu>\n", + "Date: Tue Jan 23 04:23:37 2024 -0600\n", + "\n", + " Configure SAST in `.gitlab-ci.yml`, creating this file if it does not already exist\n", + "\n", + "\u001b[33mcommit 4721ddd9ae732b4ca058962aa3df2eb1614f45b0\u001b[m\n", + "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", + "Date: Tue Jan 23 04:23:36 2024 -0600\n", + "\n", + " Initial commit\n" + ] + } + ], + "source": [ + "!git log" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "927369bc-21f0-4e72-9133-36df6f0563dc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "commit d53c732e0da3ad0cd625c25e02703face75511ee\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Mon Mar 18 20:43:56 2024 -0500\n", + "\n", + " zip folders added to lec17\n", + "\n", + "commit 0eba23b48835a617b53191de5aa8f2709517bb7a\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Mon Mar 18 20:36:44 2024 -0500\n", + "\n", + " lec16 and lec17 updated\n", + "\n", + "commit 844963cd59c769ef379b0272140b554aee6d9e60\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Thu Mar 14 13:00:40 2024 -0500\n", + "\n", + " lab11\n", + "\n", + "commit 31882131674f49276c66a6764113d53\n" + ] + } + ], + "source": [ + "git_log_output = str(check_output([\"git\", \"log\"]), encoding=\"utf-8\")\n", + "print(git_log_output[:500])" + ] + }, + { + "cell_type": "markdown", + "id": "a2bbc9ca-ceb7-42ec-9c26-a2852e403787", + "metadata": {}, + "source": [ + "#### GOAL: find all the commit numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "e02fd638-6aa4-46ce-88a9-c2cc7445e7de", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['d53c732e0da3ad0cd625c25e02703face75511ee',\n", + " '0eba23b48835a617b53191de5aa8f2709517bb7a',\n", + " '844963cd59c769ef379b0272140b554aee6d9e60',\n", + " '31882131674f49276c66a6764113d5347721f239',\n", + " '08b3243a9ed2a67cdcfcbbb080910f8c86716712',\n", + " 
'f5b5fff2409b01c2e12f3a7baae131e9a23d964b',\n", + " 'e017cc00d97679786d97c469733117c016e3ba9b',\n", + " '7897fec9ecf701e61b2ed2713fabe72e726ffd7c',\n", + " 'c13cfb21e69230b393ef2051027d029322c12cac',\n", + " '95bfde13c281a6eb28f0cd98f6dc393762732d4c']" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commits = re.findall(r\"[0-9a-f]{40}\", git_log_output)\n", + "# recent 10 commit numbers\n", + "commits[:10]" + ] + }, + { + "cell_type": "markdown", + "id": "08155a24-8193-4ed4-b15b-92a50da7dc61", + "metadata": {}, + "source": [ + "#### What days of the week does the team push things into this repo?" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "0bfc004b-7ed4-4396-976b-ec9aa4376320", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "commit d53c732e0da3ad0cd625c25e02703face75511ee\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Mon Mar 18 20:43:56 2024 -0500\n", + "\n", + " zip folders added to lec17\n", + "\n", + "commit 0eba23b48835a617b53191de5aa8f2709517bb7a\n", + "Author: gsingh58 <gurmail-singh@wisc.edu>\n", + "Date: Mon Mar 18 20:36:44 2024 -0500\n", + "\n", + " lec16 and lec17 updated\n", + "\n", + "commit 844963cd59c769ef379b0272140b554aee6d9e60\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Thu Mar 14 13:00:40 2024 -0500\n", + "\n", + " lab11\n", + "\n", + "commit 31882131674f49276c66a6764113d53\n" + ] + } + ], + "source": [ + "print(git_log_output[:500])" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "b3ce06a0-d6b6-43d2-aabe-66e0adf831d9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Mon',\n", + " 'Mon',\n", + " 'Thu',\n", + " 'Thu',\n", + " 'Wed',\n", + " 'Wed',\n", + " 'Wed',\n", + " 'Tue',\n", + " 'Mon',\n", + " 'Sat',\n", + " 'Sat',\n", + " 'Sat',\n", + " 'Thu',\n", + " 'Thu',\n", + " 'Tue',\n", + " 'Sun',\n", + " 'Sun',\n", + " 'Wed',\n", + 
" 'Wed',\n", + " 'Wed',\n", + " 'Wed',\n", + " 'Tue',\n", + " 'Thu',\n", + " 'Thu',\n", + " 'Thu',\n", + " 'Wed',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Thu',\n", + " 'Thu',\n", + " 'Wed',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Thu',\n", + " 'Thu',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Mon',\n", + " 'Sun',\n", + " 'Sun',\n", + " 'Sun',\n", + " 'Sun',\n", + " 'Sun',\n", + " 'Sun',\n", + " 'Sun',\n", + " 'Thu',\n", + " 'Thu',\n", + " 'Wed',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Tue',\n", + " 'Sat',\n", + " 'Sat',\n", + " 'Sat',\n", + " 'Sat',\n", + " 'Fri',\n", + " 'Fri',\n", + " 'Fri',\n", + " 'Fri',\n", + " 'Fri',\n", + " 'Fri',\n", + " 'Fri',\n", + " 'Fri',\n", + " 'Fri',\n", + " 'Fri',\n", + " 'Fri',\n", + " 'Thu',\n", + " 'Thu',\n", + " 'Tue',\n", + " 'Tue']" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "days = re.findall(r\"Date:\\s+(\\w+)\", git_log_output)\n", + "days" + ] + }, + { + "cell_type": "markdown", + "id": "9b4edce3-9b91-4acb-8408-a8d1e6c350a3", + "metadata": {}, + "source": [ + "#### Count unique days" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "8fd158b5-dfbb-4ede-8c6d-7c61427f9bf7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Tue 27\n", + "Thu 15\n", + "Fri 11\n", + "Wed 10\n", + "Sun 9\n", + "Sat 7\n", + "Mon 4\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "day_counts = pd.Series(days).value_counts()\n", + "day_counts" + ] + }, + { + "cell_type": "markdown", + "id": "de22ffd2-0c5c-4040-979a-f44011055f48", + "metadata": {}, + "source": [ + "#### Sort by day of the week" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": 
"70e98a92-02e6-489e-9406-2404755eed40", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Mon 4\n", + "Tue 27\n", + "Wed 10\n", + "Thu 15\n", + "Fri 11\n", + "Sun 9\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted_day_counts = day_counts.loc[[\"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sun\"]]\n", + "sorted_day_counts" + ] + }, + { + "cell_type": "markdown", + "id": "d2fccecd-7e1a-446c-86ce-368427825fd0", + "metadata": {}, + "source": [ + "#### Create a bar plot" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "0ec72c6e-59ed-4b26-8b33-5690c27088dc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 0, 'Days of the week')" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 640x480 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ax = sorted_day_counts.plot.bar()\n", + "ax.set_ylabel(\"Commit counts\")\n", + "ax.set_xlabel(\"Days of the week\")" + ] + }, + { + "cell_type": "markdown", + "id": "0a73668f-ed23-4eb1-9c78-519e5a2b48cb", + "metadata": {}, + "source": [ + "#### Find all commit authors names." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "4aab2ca9-f459-4aad-8177-2be1185b9564", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'gsingh58'" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "authors = re.findall(r\"Author:\\s+(.+?)\\s*<\", git_log_output)\n", + "authors[0]" + ] + }, + { + "cell_type": "markdown", + "id": "6fca3840-b836-4555-9e0d-cd8e9b730ea2", + "metadata": {}, + "source": [ + "#### `git log` from projects repo" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "eebd36f3-1d2d-485f-a86c-44dc7767b365", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "commit 129a4745b416e3f0be08795dca69d02d528fe893\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Mon Mar 11 10:37:16 2024 -0500\n", + "\n", + " Update file README.md\n", + "\n", + "commit 413d84dceb0f48e111b25d9f7765513181feb6d6\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sat Mar 9 18:46:18 2024 -0600\n", + "\n", + " Update 2 files\n", + " \n", + " - /Labs/Lab10/README\n", + " - /Labs/Lab10/README.md\n", + "\n", + "commit bd2acf092cfeacdc994dac733300ab61a3373b26\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sat Mar 9 18:45:42 2024 -0600\n", + "\n", + " lab10\n", + "\n", + "commit f84c2a89a44d374da385bb499738ec82b12b7965\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sat Mar 9 18:42:28 2024 -0600\n", + "\n", + " Update file EDGAR.md\n", + "\n", + "commit 11b505faae9964182b99288210f54bae5ce3e211\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sat Mar 9 18:41:35 2024 -0600\n", + "\n", + " Update file README.md\n", + "\n", + "commit 5f35a23cf70d24e627fef5fd89c0711cb144dbc4\n", + "Author: JINLANG WANG <jwang2775@wisc.edu>\n", + "Date: Sat Mar 9 18:41:06 2024 -0600\n", + "\n", + " lab9\n", + "\n", + "commit d481d4de35443a07812af9216d6883300207ae6\n" + ] + } 
+ ], + "source": [ + "git_log_output = str(check_output([\"git\", \"log\"], cwd=\"../../projects-and-labs\"), encoding=\"utf-8\")\n", + "print(git_log_output[:1000])" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "234c0230-ccc5-44a5-b975-2e37b4637444", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[' P4 pipeline update',\n", + " ' P4 released',\n", + " ' fixing pipeline and adding backup mp3 tester file to ease confusion',\n", + " ' mp3 install help',\n", + " ' pipeline update for MP3',\n", + " ' p3 pipeline changes',\n", + " ' P4 pipeline setup',\n", + " ' updating MP3 tester.py and MP3 pipeline',\n", + " ' P3 released',\n", + " ' P3 released',\n", + " ' P2 key updated',\n", + " ' MP2 Update/Fix to the tester',\n", + " ' MP2 key fix + readme update',\n", + " ' mp1 readme updated',\n", + " ' P2 Release',\n", + " ' gitlab tutorial + mp1 release',\n", + " ' initial commit (P1)']" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re.findall(r\".*[pP][1-6].*\", git_log_output)" + ] + }, + { + "cell_type": "markdown", + "id": "0801ffab-e486-4b56-a8f2-4520a8117e98", + "metadata": {}, + "source": [ + "### Emails example" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "7f232c86-1f9a-44f1-9266-a8c4920cdbef", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Gurmail [Instructor] - gsingh58(AT) cs.wisc.edu\n", + "Jinlang [Head TA] - jwang2775 (AT) wisc.edu\n", + "Elliot [TA] - eepickens (AT) cs.wisc.edu\n", + "Alex [TA] - aclinton (AT) wisc.edu\n", + "Bowman [TA] - bnbrown3 (AT) wisc.edu\n", + "Hafeez [TA] - aneesali (AT) wisc.edu\n", + "William [TA] - wycong (AT) wisc.edu\n", + "Someone [PM] - someone@wisc.edu\n", + "\n" + ] + } + ], + "source": [ + "s = \"\"\"\n", + "Gurmail [Instructor] - gsingh58(AT) cs.wisc.edu\n", + "Jinlang [Head TA] - jwang2775 (AT) wisc.edu\n", + "Elliot [TA] - 
eepickens (AT) cs.wisc.edu\n", + "Alex [TA] - aclinton (AT) wisc.edu\n", + "Bowman [TA] - bnbrown3 (AT) wisc.edu\n", + "Hafeez [TA] - aneesali (AT) wisc.edu\n", + "William [TA] - wycong (AT) wisc.edu\n", + "Someone [PM] - someone@wisc.edu\n", + "\"\"\"\n", + "print(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "e9ab8445-333a-4db9-a6a5-0f7a1c3d5b45", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('gsingh58(AT) cs.wisc.edu',\n", + " 'gsingh58',\n", + " '(AT)',\n", + " '(AT)',\n", + " 'cs.wisc.edu',\n", + " 'wisc.',\n", + " 'edu'),\n", + " ('jwang2775 (AT) wisc.edu',\n", + " 'jwang2775',\n", + " '(AT)',\n", + " '(AT)',\n", + " 'wisc.edu',\n", + " '',\n", + " 'edu'),\n", + " ('eepickens (AT) cs.wisc.edu',\n", + " 'eepickens',\n", + " '(AT)',\n", + " '(AT)',\n", + " 'cs.wisc.edu',\n", + " 'wisc.',\n", + " 'edu'),\n", + " ('aclinton (AT) wisc.edu', 'aclinton', '(AT)', '(AT)', 'wisc.edu', '', 'edu'),\n", + " ('bnbrown3 (AT) wisc.edu', 'bnbrown3', '(AT)', '(AT)', 'wisc.edu', '', 'edu'),\n", + " ('aneesali (AT) wisc.edu', 'aneesali', '(AT)', '(AT)', 'wisc.edu', '', 'edu'),\n", + " ('wycong (AT) wisc.edu', 'wycong', '(AT)', '(AT)', 'wisc.edu', '', 'edu'),\n", + " ('someone@wisc.edu', 'someone', '@', '', 'wisc.edu', '', 'edu')]" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "name = r\"\\w+\"\n", + "at = r\"@|([\\(\\[]?[Aa][Tt][\\)\\]]?)\"\n", + "domain = r\"\\w+\\.(\\w+\\.)?(edu|com|org|net|io|gov)\"\n", + "\n", + "full_regex = f\"(({name})\\s*({at})\\s*({domain}))\"\n", + "\n", + "re.findall(full_regex, s)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "290b5fa7-b5ea-4e61-9d67-0862d65199ad", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "REGEX: ((\\w+)\\s*(@|([\\(\\[]?[Aa][Tt][\\)\\]]?))\\s*(\\w+\\.(\\w+\\.)?(edu|com|org|net|io|gov)))\n", + "gsingh58@cs.wisc.edu\n", +
"jwang2775@wisc.edu\n", + "eepickens@cs.wisc.edu\n", + "aclinton@wisc.edu\n", + "bnbrown3@wisc.edu\n", + "aneesali@wisc.edu\n", + "wycong@wisc.edu\n", + "someone@wisc.edu\n" + ] + } + ], + "source": [ + "print(\"REGEX:\", full_regex)\n", + "for match in re.findall(full_regex, s):\n", + " print(match[1] + \"@\" + match[4])" + ] + }, + { + "cell_type": "markdown", + "id": "1d808a52-3312-4d25-8a18-b3cb49c341ed", + "metadata": {}, + "source": [ + "### Self-practice\n", + "\n", + "Q1: Which regex will NOT match \"123\"?\n", + "1. r\"\\d\\d\\d\"\n", + "2. r\"\\d{3}\"\n", + "3. r\"\\D\\D\\D\"\n", + "4. r\"...\"\n", + "\n", + "Q2: What will r\"^A\" match?\n", + "1. \"A\"\n", + "2. \"^A\"\n", + "3. \"BA\"\n", + "4. \"B\"\n", + "5. \"BB\"\n", + "\n", + "Q3: Which one can match \"HH\"?\n", + "1. r\"HA+H\"\n", + "2. r\"HA+?H\"\n", + "3. r\"H(A+)?H\"\n", + "\n", + "Q4: Which string(s) will match r\"^(ha)*$\"?\n", + "1. \"\"\n", + "2. \"hahah\"\n", + "3. \"that\"\n", + "4. \"HAHA\"\n", + "\n", + "Q5: What is the type of the following? re.findall(r\"(\\d) (\\w+)\", some_str)[0]\n", + "1. list\n", + "2. tuple\n", + "3. string\n", + "\n", + "Q6: What will it do?\n", + "```python\n", + "re.sub(r\"(\\d{3})-(\\d{3}-\\d{4})\",\n", + " r\"(\\g<1>) \\g<2>\",\n", + " \"608-123-4567\")\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "f15a75e9-b5a0-46ae-a84e-08fd345bcb71", + "metadata": {}, + "source": [ + "The answers to these questions can be found in self_practice.ipynb. You may want to try to answer these questions yourself and then verify your answers."
+ ] } ], "metadata": { diff --git a/lecture_material/14-web-4-and-regex-1/regex_1_lec_001.ipynb b/lecture_material/15-regex/regex_lec_001.ipynb similarity index 98% rename from lecture_material/14-web-4-and-regex-1/regex_1_lec_001.ipynb rename to lecture_material/15-regex/regex_lec_001.ipynb index 5a44224e43a7102f13a9cce57cc6df52ef774ec6..594a3f8867eaf377d4c5e23bf1fd2a19528d420e 100644 --- a/lecture_material/14-web-4-and-regex-1/regex_1_lec_001.ipynb +++ b/lecture_material/15-regex/regex_lec_001.ipynb @@ -10,7 +10,7 @@ "id": "1b1bb64b", "metadata": {}, "source": [ - "# Regex 1\n", + "# Regex\n", "\n", "## Reading\n", "\n", @@ -103,7 +103,9 @@ "outputs": [], "source": [ "#import statements\n", - "import re" + "import re\n", + "from subprocess import check_output\n", + "import pandas as pd" ] }, { @@ -952,6 +954,315 @@ "source": [ "print(re.sub(r\"(\\d+)\", \"<b>\\g<1></b>\", msg))" ] + }, + { + "cell_type": "markdown", + "id": "c0a4d8c9-722b-490a-b913-f1d9dedb83c3", + "metadata": {}, + "source": [ + "In CS <b>320</b>, there are <b>28</b> lectures, <b>11</b> quizzes, <b>3</b> exams, <b>6</b> projects, and <b>1000</b> things to learn. CS <b>320</b> is awesome!" 
+ ] + }, + { + "cell_type": "markdown", + "id": "1cf71b2a-98fc-462e-a689-818f1f3a3317", + "metadata": {}, + "source": [ + "### Git log example" + ] + }, + { + "cell_type": "markdown", + "id": "eacdeac4-176e-4588-9250-651627a9df39", + "metadata": {}, + "source": [ + "#### Run `git log` as a shell command" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76052b01-1363-46f8-8c6a-d7a21de893cc", + "metadata": {}, + "outputs": [], + "source": [ + "!git log" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d2fe250-7333-4612-8ce1-be224d3826ab", + "metadata": {}, + "outputs": [], + "source": [ + "git_log_output = str(check_output([\"git\", \"log\"]), encoding=\"utf-8\")\n", + "print(git_log_output[:500])" + ] + }, + { + "cell_type": "markdown", + "id": "a122d61e-daed-469d-b68c-f0c4e5a9d0c1", + "metadata": {}, + "source": [ + "#### GOAL: find all the commit numbers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f06f214-f6d1-49a5-a14e-f83fb4ba9a29", + "metadata": {}, + "outputs": [], + "source": [ + "commits = re.findall(r\"\", git_log_output)\n", + "# recent 10 commit numbers\n", + "commits[:10]" + ] + }, + { + "cell_type": "markdown", + "id": "1996d812-51c3-4d0f-aa0b-a867c6aec4d7", + "metadata": {}, + "source": [ + "#### What days of the week does the team push things into this repo?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eeaac309-3b8d-46ab-9072-a0f061007277", + "metadata": {}, + "outputs": [], + "source": [ + "print(git_log_output[:500])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f358c85-9abf-4dfe-9d0a-9f9914093dbf", + "metadata": {}, + "outputs": [], + "source": [ + "days = re.findall(r\"\", git_log_output)\n", + "days" + ] + }, + { + "cell_type": "markdown", + "id": "5e4ab9be-020d-4274-83e7-7564f45c4917", + "metadata": {}, + "source": [ + "#### Count unique days" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "768dd6c5-ce85-4d43-b930-6e0911d8d95c", + "metadata": {}, + "outputs": [], + "source": [ + "day_counts = pd.Series(days).value_counts()\n", + "day_counts" + ] + }, + { + "cell_type": "markdown", + "id": "96ee87eb-0eeb-4360-b74a-0f77e4085157", + "metadata": {}, + "source": [ + "#### Sort by day of the week" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c0b9e19-5986-446b-9280-25b4d5d1c0e5", + "metadata": {}, + "outputs": [], + "source": [ + "sorted_day_counts = day_counts.loc[[\"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sun\"]]\n", + "sorted_day_counts" + ] + }, + { + "cell_type": "markdown", + "id": "02b735e7-562a-48de-ab1c-70a36ed4afa6", + "metadata": {}, + "source": [ + "#### Create a bar plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9826d060-dc83-4949-bf79-58d77ad0bf39", + "metadata": {}, + "outputs": [], + "source": [ + "ax = sorted_day_counts.plot.bar()\n", + "ax.set_ylabel(\"Commit counts\")\n", + "ax.set_xlabel(\"Days of the week\")" + ] + }, + { + "cell_type": "markdown", + "id": "1649e14e-c867-46be-a1bc-cc4eab80ff1a", + "metadata": {}, + "source": [ + "#### Find all commit authors names." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2105c891-c00b-4e00-b61f-109933ea9f83", + "metadata": {}, + "outputs": [], + "source": [ + "authors = re.findall(r\"\", git_log_output)\n", + "authors[0]" + ] + }, + { + "cell_type": "markdown", + "id": "a19905c2-c296-458d-8d6c-8563439f1621", + "metadata": {}, + "source": [ + "#### `git log` from projects repo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1317631c-e695-4158-a83c-d3e3edfc15f6", + "metadata": {}, + "outputs": [], + "source": [ + "git_log_output = str(check_output([\"git\", \"log\"], cwd=\"../../projects-and-labs\"), encoding=\"utf-8\")\n", + "print(git_log_output[:1000])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4cf0f370-c8c5-4ff0-979f-d55f2a3bbaef", + "metadata": {}, + "outputs": [], + "source": [ + "re.findall(r\"\", git_log_output)" + ] + }, + { + "cell_type": "markdown", + "id": "d3199138-3104-44dd-a3ca-9f775817dad8", + "metadata": {}, + "source": [ + "### Emails example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6e6dae7-a374-40b1-a044-9091c51b60d2", + "metadata": {}, + "outputs": [], + "source": [ + "s = \"\"\"\n", + "Gurmail [Instructor] - gsingh58(AT) cs.wisc.edu\n", + "Jinlang [Head TA] - jwang2775 (AT) wisc.edu\n", + "Elliot [TA] - eepickens (AT) cs.wisc.edu\n", + "Alex [TA] - aclinton (AT) wisc.edu\n", + "Bowman [TA] - bnbrown3 (AT) wisc.edu\n", + "Hafeez [TA] - aneesali (AT) wisc.edu\n", + "William [TA] - wycong (AT) wisc.edu\n", + "\"\"\"\n", + "print(s)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df1c2095-5a58-4896-a3f5-e5e6a28fc4b6", + "metadata": {}, + "outputs": [], + "source": [ + "name = r\"\\w+\"\n", + "at = r\"@|([\\(\\[]?[Aa][Tt][\\)\\]]?)\"\n", + "domain = r\"\\w+\\.(\\w+\\.)?(edu|com|org|net|io|gov)\"\n", + "\n", + "full_regex = f\"(({name})\\s*({at})\\s*({domain}))\"\n", + "\n", + "re.findall(full_regex, s)" + ] + }, + { + "cell_type":
"code", + "execution_count": null, + "id": "2a778af2-0bbe-44e3-85f3-c868543ddc76", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"REGEX:\", full_regex)\n", + "for match in re.findall(full_regex, s):\n", + " print(match[1] + \"@\" + match[4])" + ] + }, + { + "cell_type": "markdown", + "id": "a1d55737-24a0-40bb-bedd-7125a921e7e7", + "metadata": {}, + "source": [ + "### Self-practice\n", + "\n", + "Q1: Which regex will NOT match \"123\"?\n", + "1. r\"\\d\\d\\d\"\n", + "2. r\"\\d{3}\"\n", + "3. r\"\\D\\D\\D\"\n", + "4. r\"...\"\n", + "\n", + "Q2: What will r\"^A\" match?\n", + "1. \"A\"\n", + "2. \"^A\"\n", + "3. \"BA\"\n", + "4. \"B\"\n", + "5. \"BB\"\n", + "\n", + "Q3: Which one can match \"HH\"?\n", + "1. r\"HA+H\"\n", + "2. r\"HA+?H\"\n", + "3. r\"H(A+)?H\"\n", + "\n", + "Q4: Which string(s) will match r\"^(ha)*$\"?\n", + "1. \"\"\n", + "2. \"hahah\"\n", + "3. \"that\"\n", + "4. \"HAHA\"\n", + "\n", + "Q5: What is the type of the following? re.findall(r\"(\\d) (\\w+)\", some_str)[0]\n", + "1. list\n", + "2. tuple\n", + "3. string\n", + "\n", + "Q6: What will it do?\n", + "```python\n", + "re.sub(r\"(\\d{3})-(\\d{3}-\\d{4})\",\n", + " r\"(\\g<1>) \\g<2>\",\n", + " \"608-123-4567\")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a76c9298-2460-4572-8656-e197ca1636ca", + "metadata": {}, + "outputs": [], + "source": [ + "# The answers to these questions can be found in self_practice.ipynb. You may want to try to answer these questions yourself and then verify your answers."
+ ] } ], "metadata": { diff --git a/lecture_material/14-web-4-and-regex-1/regex_1_lec_002.ipynb b/lecture_material/15-regex/regex_lec_002.ipynb similarity index 98% rename from lecture_material/14-web-4-and-regex-1/regex_1_lec_002.ipynb rename to lecture_material/15-regex/regex_lec_002.ipynb index 5a44224e43a7102f13a9cce57cc6df52ef774ec6..594a3f8867eaf377d4c5e23bf1fd2a19528d420e 100644 --- a/lecture_material/14-web-4-and-regex-1/regex_1_lec_002.ipynb +++ b/lecture_material/15-regex/regex_lec_002.ipynb @@ -10,7 +10,7 @@ "id": "1b1bb64b", "metadata": {}, "source": [ - "# Regex 1\n", + "# Regex\n", "\n", "## Reading\n", "\n", @@ -103,7 +103,9 @@ "outputs": [], "source": [ "#import statements\n", - "import re" + "import re\n", + "from subprocess import check_output\n", + "import pandas as pd" ] }, { @@ -952,6 +954,315 @@ "source": [ "print(re.sub(r\"(\\d+)\", \"<b>\\g<1></b>\", msg))" ] + }, + { + "cell_type": "markdown", + "id": "c0a4d8c9-722b-490a-b913-f1d9dedb83c3", + "metadata": {}, + "source": [ + "In CS <b>320</b>, there are <b>28</b> lectures, <b>11</b> quizzes, <b>3</b> exams, <b>6</b> projects, and <b>1000</b> things to learn. CS <b>320</b> is awesome!" 
+ ] + }, + { + "cell_type": "markdown", + "id": "1cf71b2a-98fc-462e-a689-818f1f3a3317", + "metadata": {}, + "source": [ + "### Git log example" + ] + }, + { + "cell_type": "markdown", + "id": "eacdeac4-176e-4588-9250-651627a9df39", + "metadata": {}, + "source": [ + "#### Run `git log` as a shell command" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76052b01-1363-46f8-8c6a-d7a21de893cc", + "metadata": {}, + "outputs": [], + "source": [ + "!git log" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d2fe250-7333-4612-8ce1-be224d3826ab", + "metadata": {}, + "outputs": [], + "source": [ + "git_log_output = str(check_output([\"git\", \"log\"]), encoding=\"utf-8\")\n", + "print(git_log_output[:500])" + ] + }, + { + "cell_type": "markdown", + "id": "a122d61e-daed-469d-b68c-f0c4e5a9d0c1", + "metadata": {}, + "source": [ + "#### GOAL: find all the commit numbers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f06f214-f6d1-49a5-a14e-f83fb4ba9a29", + "metadata": {}, + "outputs": [], + "source": [ + "commits = re.findall(r\"\", git_log_output)\n", + "# recent 10 commit numbers\n", + "commits[:10]" + ] + }, + { + "cell_type": "markdown", + "id": "1996d812-51c3-4d0f-aa0b-a867c6aec4d7", + "metadata": {}, + "source": [ + "#### What days of the week does the team push things into this repo?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eeaac309-3b8d-46ab-9072-a0f061007277", + "metadata": {}, + "outputs": [], + "source": [ + "print(git_log_output[:500])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f358c85-9abf-4dfe-9d0a-9f9914093dbf", + "metadata": {}, + "outputs": [], + "source": [ + "days = re.findall(r\"\", git_log_output)\n", + "days" + ] + }, + { + "cell_type": "markdown", + "id": "5e4ab9be-020d-4274-83e7-7564f45c4917", + "metadata": {}, + "source": [ + "#### Count unique days" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "768dd6c5-ce85-4d43-b930-6e0911d8d95c", + "metadata": {}, + "outputs": [], + "source": [ + "day_counts = pd.Series(days).value_counts()\n", + "day_counts" + ] + }, + { + "cell_type": "markdown", + "id": "96ee87eb-0eeb-4360-b74a-0f77e4085157", + "metadata": {}, + "source": [ + "#### Sort by day of the week" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c0b9e19-5986-446b-9280-25b4d5d1c0e5", + "metadata": {}, + "outputs": [], + "source": [ + "sorted_day_counts = day_counts.loc[[\"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sun\"]]\n", + "sorted_day_counts" + ] + }, + { + "cell_type": "markdown", + "id": "02b735e7-562a-48de-ab1c-70a36ed4afa6", + "metadata": {}, + "source": [ + "#### Create a bar plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9826d060-dc83-4949-bf79-58d77ad0bf39", + "metadata": {}, + "outputs": [], + "source": [ + "ax = sorted_day_counts.plot.bar()\n", + "ax.set_ylabel(\"Commit counts\")\n", + "ax.set_xlabel(\"Days of the week\")" + ] + }, + { + "cell_type": "markdown", + "id": "1649e14e-c867-46be-a1bc-cc4eab80ff1a", + "metadata": {}, + "source": [ + "#### Find all commit authors names." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2105c891-c00b-4e00-b61f-109933ea9f83", + "metadata": {}, + "outputs": [], + "source": [ + "authors = re.findall(r\"\", git_log_output)\n", + "authors[0]" + ] + }, + { + "cell_type": "markdown", + "id": "a19905c2-c296-458d-8d6c-8563439f1621", + "metadata": {}, + "source": [ + "#### `git log` from projects repo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1317631c-e695-4158-a83c-d3e3edfc15f6", + "metadata": {}, + "outputs": [], + "source": [ + "git_log_output = str(check_output([\"git\", \"log\"], cwd=\"../../projects-and-labs\"), encoding=\"utf-8\")\n", + "print(git_log_output[:1000])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4cf0f370-c8c5-4ff0-979f-d55f2a3bbaef", + "metadata": {}, + "outputs": [], + "source": [ + "re.findall(r\"\", git_log_output)" + ] + }, + { + "cell_type": "markdown", + "id": "d3199138-3104-44dd-a3ca-9f775817dad8", + "metadata": {}, + "source": [ + "### Emails example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6e6dae7-a374-40b1-a044-9091c51b60d2", + "metadata": {}, + "outputs": [], + "source": [ + "s = \"\"\"\n", + "Gurmail [Instructor] - gsingh58(AT) cs.wisc.edu\n", + "Jinlang [Head TA] - jwang2775 (AT) wisc.edu\n", + "Elliot [TA] - eepickens (AT) cs.wisc.edu\n", + "Alex [TA] - aclinton (AT) wisc.edu\n", + "Bowman [TA] - bnbrown3 (AT) wisc.edu\n", + "Hafeez [TA] - aneesali (AT) wisc.edu\n", + "William [TA] - wycong (AT) wisc.edu\n", + "\"\"\"\n", + "print(s)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df1c2095-5a58-4896-a3f5-e5e6a28fc4b6", + "metadata": {}, + "outputs": [], + "source": [ + "name = r\"\\w+\"\n", + "at = r\"@|([\\(\\[]?[Aa][Tt][\\)\\]]?)\"\n", + "domain = r\"\\w+\\.(\\w+\\.)?(edu|com|org|net|io|gov)\"\n", + "\n", + "full_regex = f\"(({name})\\s*({at})\\s*({domain}))\"\n", + "\n", + "re.findall(full_regex, s)" + ] + }, + { + "cell_type":
"code", + "execution_count": null, + "id": "2a778af2-0bbe-44e3-85f3-c868543ddc76", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"REGEX:\", full_regex)\n", + "for match in re.findall(full_regex, s):\n", + " print(match[1] + \"@\" + match[4])" + ] + }, + { + "cell_type": "markdown", + "id": "a1d55737-24a0-40bb-bedd-7125a921e7e7", + "metadata": {}, + "source": [ + "### Self-practice\n", + "\n", + "Q1: Which regex will NOT match \"123\"?\n", + "1. r\"\\d\\d\\d\"\n", + "2. r\"\\d{3}\"\n", + "3. r\"\\D\\D\\D\"\n", + "4. r\"...\"\n", + "\n", + "Q2: What will r\"^A\" match?\n", + "1. \"A\"\n", + "2. \"^A\"\n", + "3. \"BA\"\n", + "4. \"B\"\n", + "5. \"BB\"\n", + "\n", + "Q3: Which one can match \"HH\"?\n", + "1. r\"HA+H\"\n", + "2. r\"HA+?H\"\n", + "3. r\"H(A+)?H\"\n", + "\n", + "Q4: Which string(s) will match r\"^(ha)*$\"?\n", + "1. \"\"\n", + "2. \"hahah\"\n", + "3. \"that\"\n", + "4. \"HAHA\"\n", + "\n", + "Q5: What is the type of the following? re.findall(r\"(\\d) (\\w+)\", some_str)[0]\n", + "1. list\n", + "2. tuple\n", + "3. string\n", + "\n", + "Q6: What will it do?\n", + "```python\n", + "re.sub(r\"(\\d{3})-(\\d{3}-\\d{4})\",\n", + " r\"(\\g<1>) \\g<2>\",\n", + " \"608-123-4567\")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a76c9298-2460-4572-8656-e197ca1636ca", + "metadata": {}, + "outputs": [], + "source": [ + "# The answers to these questions can be found in self_practice.ipynb. You may want to try to answer these questions yourself and then verify your answers."
+ ] } ], "metadata": { diff --git a/lecture_material/15-regex_2/self_practice.ipynb b/lecture_material/15-regex/self_practice.ipynb similarity index 100% rename from lecture_material/15-regex_2/self_practice.ipynb rename to lecture_material/15-regex/self_practice.ipynb diff --git a/lecture_material/15-regex_2/regex_2.ipynb b/lecture_material/15-regex_2/regex_2.ipynb deleted file mode 100644 index 623d72461d7b75a77c129d08ff070d364c0d74bb..0000000000000000000000000000000000000000 --- a/lecture_material/15-regex_2/regex_2.ipynb +++ /dev/null @@ -1,1960 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "e60c1c48", - "metadata": {}, - "source": [ - "# Regex 2" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "0dba68b0", - "metadata": {}, - "outputs": [], - "source": [ - "#import statements\n", - "import re\n", - "from subprocess import check_output\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "b97c8008-8e39-4a9f-89a1-0d1ddbb1ac01", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "A DAG is a directed graph without cycles. A tree is a DAG where every node has one parent (except the root, which has none). To learn more, visit www.example.com or call 1-608-123-4567. :) ¯\\_(ツ)_/¯\n", - "1-608-123-4567\n", - "a-bcd-efg-hijg (not a phone number)\n", - "1-608-123-456 (not a phone number)\n", - "608-123-4567\n", - "123-4567\n", - "1-123-4567 (not a phone number)\n", - "\n", - "In CS 320, there are 11 quizzes, 6 projects, 28 lectures, and 1000 things to learn. CS 320 is awesome!\n", - "In CS 320, there are 11 quizzes, 6 projects,\n", - "28 lectures, and 1000 things to learn. 
CS 320 is awesome!\n" - ] - } - ], - "source": [ - "# Example strings\n", - "# from DS100 book...\n", - "def reg(regex, text):\n", - " \"\"\"\n", - " Prints the string with the regex match highlighted.\n", - " \"\"\"\n", - " print(re.sub(f'({regex})', r'\\033[1;30;43m\\1\\033[m', text))\n", - "s1 = \" \".join([\"A DAG is a directed graph without cycles.\",\n", - " \"A tree is a DAG where every node has one parent (except the root, which has none).\",\n", - " \"To learn more, visit www.example.com or call 1-608-123-4567. :) ¯\\_(ツ)_/¯\"])\n", - "print(s1)\n", - "\n", - "s2 = \"\"\"1-608-123-4567\n", - "a-bcd-efg-hijg (not a phone number)\n", - "1-608-123-456 (not a phone number)\n", - "608-123-4567\n", - "123-4567\n", - "1-123-4567 (not a phone number)\n", - "\"\"\"\n", - "print(s2)\n", - "\n", - "s3 = \"In CS 320, there are 11 quizzes, 6 projects, 28 lectures, and 1000 things to learn. CS 320 is awesome!\"\n", - "print(s3)\n", - "\n", - "s4 = \"\"\"In CS 320, there are 11 quizzes, 6 projects,\n", - "28 lectures, and 1000 things to learn. CS 320 is awesome!\"\"\"\n", - "print(s4)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "924069c5-82be-4423-b659-2beee8e226be", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "A DAG is a directed graph without cycles. A tree is a DAG where every node has one parent (except the root, which has none). To learn more, visit www.example.com or call 1-608-123-4567. 
:) ¯\\_(ツ)_/¯\n" - ] - } - ], - "source": [ - "print(s1)" - ] - }, - { - "cell_type": "markdown", - "id": "6a2eff34", - "metadata": {}, - "source": [ - "### Regex is case sensitive\n", - "\n", - "### Character classes\n", - "\n", - "- Character classes can be mentioned within `[...]`\n", - "- `^` means `NOT` of a character class\n", - "- `-` enables us to mention range of characters, for example `[A-Z]`\n", - "- `|` enables us to perform `OR`\n", - "\n", - "### Metacharacters\n", - "\n", - "- predefined character classes\n", - " - `\\d` => digits\n", - " - `\\s` => whitespace (space, tab, newline)\n", - " - `\\w` => \"word\" characters (digits, letters, underscores, etc) --- helpful for variable name matches and whole word matches (as it doesn't match whitespace --- `\\s`)\n", - " - `.` => wildcard: anything except newline\n", - "- capitalized version of character classes mean `NOT`, for example `\\D` => everything except digits\n", - "\n", - "### REPETITION\n", - "\n", - "- `<character>{<num matches>}` - for example: `w{3}`\n", - "- matches cannot overlap\n", - "\n", - "### Variable length repitition operators\n", - "\n", - "- `*` => 0 or more (greedy: match as many characters as possible)\n", - "- `+` => 1 or more (greedy: match as many characters as possible)\n", - "- `?` => 0 or 1\n", - "- `*?` => 0 or more (non-greedy: match as few characters as possible)\n", - "- `+?` => 1 or more (non-greedy: match as few characters as possible)\n", - "\n", - "#### Find everything inside of parentheses." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "11f75d92-f215-47f4-8804-5e5727d3be55", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;30;43mA DAG is a directed graph without cycles. A tree is a DAG where every node has one parent (except the root, which has none). To learn more, visit www.example.com or call 1-608-123-4567. 
:) ¯\\_(ツ)_/¯\u001b[m\u001b[1;30;43m\u001b[m\n" - ] - } - ], - "source": [ - "# this doesn't work\n", - "# it captures everything because () have special meaning (coming up)\n", - "reg(r\"(.*)\", s1)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "b488460e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "A DAG is a directed graph without cycles. A tree is a DAG where every node has one parent \u001b[1;30;43m(except the root, which has none). To learn more, visit www.example.com or call 1-608-123-4567. :) ¯\\_(ツ)\u001b[m_/¯\n" - ] - } - ], - "source": [ - "# How can we change this to not use special meaning of ()?\n", - "# * is greedy: match as many characters as possible\n", - "reg(r\"\\(.*\\)\", s1)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "42155b54-d27b-4418-81a1-c005c508d738", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "A DAG is a directed graph without cycles. A tree is a DAG where every node has one parent \u001b[1;30;43m(except the root, which has none)\u001b[m. To learn more, visit www.example.com or call 1-608-123-4567. :) ¯\\_\u001b[1;30;43m(ツ)\u001b[m_/¯\n" - ] - } - ], - "source": [ - "# non-greedy: stop at the first possible spot instead of the last possible spot\n", - "reg(r\"\\(.*?\\)\", s1)" - ] - }, - { - "cell_type": "markdown", - "id": "0fd70cfd", - "metadata": {}, - "source": [ - "### Anchor characters\n", - "- `^` => start of string\n", - " - `^` is overloaded --- what was the other usage?\n", - "- `$` => end of string\n", - "\n", - "#### Find everything in the first sentence." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "40fed5db", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;30;43mA DAG is a directed graph without cycles.\u001b[m\u001b[1;30;43m A tree is a DAG where every node has one parent (except the root, which has none).\u001b[m\u001b[1;30;43m To learn more, visit www.\u001b[m\u001b[1;30;43mexample.\u001b[m\u001b[1;30;43mcom or call 1-608-123-4567.\u001b[m :) ¯\\_(ツ)_/¯\n" - ] - } - ], - "source": [ - "# doesn't work because remember regex finds all possible matches\n", - "# so it matches every single sentence \n", - "# (even though we are doing non-greedy match)\n", - "reg(r\".*?\\.\", s1)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "7e97abd8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;30;43mA DAG is a directed graph without cycles.\u001b[m A tree is a DAG where every node has one parent (except the root, which has none). To learn more, visit www.example.com or call 1-608-123-4567. :) ¯\\_(ツ)_/¯\n" - ] - } - ], - "source": [ - "reg(r\"^.*?\\.\", s1)" - ] - }, - { - "cell_type": "markdown", - "id": "f66a4651", - "metadata": {}, - "source": [ - "#### Find everything in the first two sentences." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "cfa8353e-4402-4f64-b3f8-35e73b2d7a68", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;30;43mA DAG is a directed graph without cycles. A tree is a DAG where every node has one parent (except the root, which has none).\u001b[m To learn more, visit www.example.com or call 1-608-123-4567. :) ¯\\_(ツ)_/¯\n" - ] - } - ], - "source": [ - "reg(r\"^(.*?\\.){2}\", s1)" - ] - }, - { - "cell_type": "markdown", - "id": "76570acd", - "metadata": {}, - "source": [ - "#### Find last \"word\" in the sentence." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "f35a6c81-47b5-48eb-8357-888fe832f85b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "A DAG is a directed graph without cycles. A tree is a DAG where every node has one parent (except the root, which has none). To learn more, visit www.example.com or call 1-608-123-4567. :) \u001b[1;30;43m¯\\_(ツ)_/¯\u001b[m\n" - ] - } - ], - "source": [ - "reg(r\"\\S+$\", s1)" - ] - }, - { - "cell_type": "markdown", - "id": "4b25fb66", - "metadata": {}, - "source": [ - "### Case study: find all phone numbers." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "8ecbeaf0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1-608-123-4567\n", - "a-bcd-efg-hijg (not a phone number)\n", - "1-608-123-456 (not a phone number)\n", - "608-123-4567\n", - "123-4567\n", - "1-123-4567 (not a phone number)\n", - "\n" - ] - } - ], - "source": [ - "print(s2)\n", - "# The country code (1) in the front is optional\n", - "# The area code (608) is also optional\n", - "# Doesn't make sense to match country code without area code though!" 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "88a01725-790c-4f21-b334-e3d31bed24b5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;30;43m1-608-123-4567\u001b[m\n", - "a-bcd-efg-hijg (not a phone number)\n", - "1-608-123-456 (not a phone number)\n", - "608-123-4567\n", - "123-4567\n", - "1-123-4567 (not a phone number)\n", - "\n" - ] - } - ], - "source": [ - "# Full US phone numbers\n", - "reg(r\"\\d-\\d{3}-\\d{3}-\\d{4}\", s2)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "ea8ce0be-98f8-4f9c-bcc0-1d8a0bb183a8", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;30;43m1-608-123-4567\u001b[m\n", - "a-bcd-efg-hijg (not a phone number)\n", - "1-608-123-456 (not a phone number)\n", - "\u001b[1;30;43m608-123-4567\u001b[m\n", - "123-4567\n", - "1-123-4567 (not a phone number)\n", - "\n" - ] - } - ], - "source": [ - "# The country code (1) in the front is optional\n", - "reg(r\"(\\d-)?\\d{3}-\\d{3}-\\d{4}\", s2)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "5befb23a-848f-44d4-aaac-6b21ac0bbf43", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;30;43m1-608-123-4567\u001b[m\n", - "a-bcd-efg-hijg (not a phone number)\n", - "1-608-123-456 (not a phone number)\n", - "\u001b[1;30;43m608-123-4567\u001b[m\n", - "\u001b[1;30;43m123-4567\u001b[m\n", - "\u001b[1;30;43m1-123-4567\u001b[m (not a phone number)\n", - "\n" - ] - } - ], - "source": [ - "# The area code (608) is also optional\n", - "# Doesn't make sense to have country code without area code though!\n", - "reg(r\"(\\d-)?(\\d{3}-)?\\d{3}-\\d{4}\", s2)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "34164fd1-a422-45a0-8e11-54199ca77120", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
"\u001b[1;30;43m1-608-123-4567\u001b[m\n", - "a-bcd-efg-hijg (not a phone number)\n", - "1-608-123-456 (not a phone number)\n", - "\u001b[1;30;43m608-123-4567\u001b[m\n", - "\u001b[1;30;43m123-4567\u001b[m\n", - "1-\u001b[1;30;43m123-4567\u001b[m (not a phone number)\n", - "\n" - ] - } - ], - "source": [ - "# This is good enough for 320 quizzes/tests\n", - "# But clearly, the last match is not correct\n", - "reg(r\"((\\d-)?\\d{3}-)?\\d{3}-\\d{4}\", s2)" - ] - }, - { - "cell_type": "markdown", - "id": "8a2ee4e2", - "metadata": {}, - "source": [ - "Regex documentation link: https://docs.python.org/3/library/re.html." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "694a585b-b5a7-4a6f-a0f1-60521f7dfc47", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;30;43m1-608-123-4567\u001b[m\n", - "a-bcd-efg-hijg (not a phone number)\n", - "1-608-123-456 (not a phone number)\n", - "\u001b[1;30;43m608-123-4567\u001b[m\n", - "\u001b[1;30;43m123-4567\u001b[m\n", - "1-123-4567 (not a phone number)\n", - "\n" - ] - } - ], - "source": [ - "# BONUS: negative lookbehind (I won't test this)\n", - "reg(r\"(?<!\\d\\-)((\\d-)?\\d{3}-)?\\d{3}-\\d{4}\", s2)" - ] - }, - { - "cell_type": "markdown", - "id": "3973350b", - "metadata": {}, - "source": [ - "There is also a negative lookahead. For example, how to avoid matching \"1-608-123-456\" in \"1-608-123-4569999\". You can explore this if you are interested." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "4988d765", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;30;43m608-123-4569\u001b[m999\n" - ] - } - ], - "source": [ - "reg(r\"(?<!\\d\\-)((\\d-)?\\d{3}-)?\\d{3}-\\d{4}\", \"608-123-4569999\")" - ] - }, - { - "cell_type": "markdown", - "id": "b02ae9e0", - "metadata": {}, - "source": [ - "### Testing your regex\n", - "- you could use `reg(...)` function\n", - "- another useful resource: https://regex101.com/" - ] - }, - { - "cell_type": "markdown", - "id": "4a973271", - "metadata": {}, - "source": [ - "### `re` module\n", - "- `re.findall(<PATTERN>, <SEARCH STRING>)`: regular expression matches\n", - " - returns a list of strings \n", - "- `re.sub(<PATTERN>, <REPLACEMENT>, <SEARCH STRING>)`: regular expression match + substitution\n", - " - returns a new string with the substitutions (remember strings are immutable)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "73ec525f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "In CS 320,\tthere are 28 lectures, 11 quizzes, 3 exams,\t6 projects, and 1000 things to learn. CS 320 is awesome!\n" - ] - } - ], - "source": [ - "msg = \"In CS 320,\\tthere are 28 lectures, 11 quizzes, 3 exams,\\t6 projects, and 1000 things to learn. CS 320 is awesome!\"\n", - "print(msg)" - ] - }, - { - "cell_type": "markdown", - "id": "34998a5e", - "metadata": {}, - "source": [ - "#### Find all digits." 
- ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "7f42c25a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['320', '28', '11', '3', '6', '1000', '320']" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "re.findall(r\"\\d+\", msg)" - ] - }, - { - "cell_type": "markdown", - "id": "70b2f488", - "metadata": {}, - "source": [ - "### Groups\n", - "- we can capture matches using `()` => this is the special meaning of `()`\n", - "- returns a list of tuples, where length of the tuple will be number of groups\n", - "\n", - "#### Find all digits and the word that comes after that." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "5309adee", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('28', 'lectures'),\n", - " ('11', 'quizzes'),\n", - " ('3', 'exams'),\n", - " ('6', 'projects'),\n", - " ('1000', 'things'),\n", - " ('320', 'is')]" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "matches = re.findall(r\"(\\d+) (\\w+)\", msg)\n", - "matches" - ] - }, - { - "cell_type": "markdown", - "id": "c138ff9a", - "metadata": {}, - "source": [ - "#### Goal: make a dict (course component => count, like \"projects\" => 6)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "6654bd71", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'lectures': 28,\n", - " 'quizzes': 11,\n", - " 'exams': 3,\n", - " 'projects': 6,\n", - " 'things': 1000,\n", - " 'is': 320}" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "course_dict = {}\n", - "for count, component in matches:\n", - " course_dict[component] = int(count)\n", - "course_dict" - ] - }, - { - "cell_type": "markdown", - "id": "c4b6b505", - "metadata": {}, - "source": [ - "### Unlike matches, groups can overlap\n", - "\n", - "#### 
Find and group all digits and the word that comes after that." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "491c3460", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('28 lectures', '28', 'lectures'),\n", - " ('11 quizzes', '11', 'quizzes'),\n", - " ('3 exams', '3', 'exams'),\n", - " ('6 projects', '6', 'projects'),\n", - " ('1000 things', '1000', 'things'),\n", - " ('320 is', '320', 'is')]" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "re.findall(r\"((\\d+) (\\w+))\", msg)" - ] - }, - { - "cell_type": "markdown", - "id": "d2227e69", - "metadata": {}, - "source": [ - "#### Substitute all digits with \"###\"." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "6d1fede1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'In CS ###,\\tthere are ### lectures, ### quizzes, ### exams,\\t### projects, and ### things to learn. CS ### is awesome!'" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "re.sub(r\"\\d+\", \"###\", msg)" - ] - }, - { - "cell_type": "markdown", - "id": "9d531122", - "metadata": {}, - "source": [ - "#### Goal: normalize whitespace (everything will be a single space)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "4becbe70", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "In CS 320,\tthere are 28 lectures, 11 quizzes, 3 exams,\t6 projects, and 1000 things to learn. CS 320 is awesome!\n" - ] - } - ], - "source": [ - "print(msg)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "72a6eb42", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'In CS 320, there are 28 lectures, 11 quizzes, 3 exams, 6 projects, and 1000 things to learn. 
CS 320 is awesome!'" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "re.sub(r\"\\s+\", \" \", msg)" - ] - }, - { - "cell_type": "markdown", - "id": "6faf33fd", - "metadata": {}, - "source": [ - "### How to use groups is substitution?\n", - "- `\\g<N>` gives you the result of the N'th grouping.\n", - "\n", - "#### Substitute all course component counts with HTML bold tags." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "8df577fd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "In CS <b>320</b>,\tthere are <b>28</b> lectures, <b>11</b> quizzes, <b>3</b> exams,\t<b>6</b> projects, and <b>1000</b> things to learn. CS <b>320</b> is awesome!\n" - ] - } - ], - "source": [ - "print(re.sub(r\"(\\d+)\", \"<b>\\g<1></b>\", msg))" - ] - }, - { - "cell_type": "markdown", - "id": "35a15a41", - "metadata": {}, - "source": [ - "In CS <b>320</b>, there are <b>28</b> lectures, <b>10</b> quizzes, <b>3</b> exams, <b>6</b> projects, and <b>1000</b> things to learn. CS <b>320</b> is awesome!" 
- ] - }, - { - "cell_type": "markdown", - "id": "bede932a", - "metadata": {}, - "source": [ - "### Git log example" - ] - }, - { - "cell_type": "markdown", - "id": "f816ab34", - "metadata": {}, - "source": [ - "#### Run `git log` as a shell command" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "f55eec9a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mcommit f5b5fff2409b01c2e12f3a7baae131e9a23d964b\u001b[m\u001b[33m (\u001b[m\u001b[1;36mHEAD -> \u001b[m\u001b[1;32mmain\u001b[m\u001b[33m, \u001b[m\u001b[1;31morigin/main\u001b[m\u001b[33m, \u001b[m\u001b[1;31morigin/HEAD\u001b[m\u001b[33m)\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Wed Mar 13 09:39:09 2024 -0500\n", - "\n", - " regex_2_lec_002 renamed as regex_1_lec_002\n", - "\n", - "\u001b[33mcommit e017cc00d97679786d97c469733117c016e3ba9b\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Wed Mar 13 09:34:43 2024 -0500\n", - "\n", - " web_4_lec_002 added\n", - "\n", - "\u001b[33mcommit 7897fec9ecf701e61b2ed2713fabe72e726ffd7c\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Tue Mar 12 05:51:48 2024 -0500\n", - "\n", - " lec14 and lec15 added\n", - "\n", - "\u001b[33mcommit c13cfb21e69230b393ef2051027d029322c12cac\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Mon Mar 11 10:36:36 2024 -0500\n", - "\n", - " Update file README.md\n", - "\n", - "\u001b[33mcommit 95bfde13c281a6eb28f0cd98f6dc393762732d4c\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sat Mar 9 18:45:08 2024 -0600\n", - "\n", - " lab10\n", - "\n", - "\u001b[33mcommit 90111df9e72309597c67da15bb61de78bff126e7\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sat Mar 9 18:43:47 2024 -0600\n", - "\n", - " lab9\n", - "\n", - "\u001b[33mcommit c401ade096b0dd3e1178f46d067b2fbb98d499f3\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - 
"Date: Sat Mar 9 18:43:20 2024 -0600\n", - "\n", - " Update 2 files\n", - " \n", - " - /Labs/Lab9/EDGAR.md\n", - " - /Labs/Lab9/README.md\n", - "\n", - "\u001b[33mcommit c99c65b5efdaa91f8fce8fadffc41c4747e0a3a0\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Thu Mar 7 08:36:02 2024 -0600\n", - "\n", - " lec13 updated\n", - "\n", - "\u001b[33mcommit df5877233cc57005e3003c19c0dbf89aafc53804\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Thu Mar 7 07:02:16 2024 -0600\n", - "\n", - " lec12 & 13 updated\n", - "\n", - "\u001b[33mcommit 26470de563bbe3bc65a73718aee1988ba64e8601\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Tue Mar 5 05:22:26 2024 -0600\n", - "\n", - " lec12 and 13: ipynb files added\n", - "\n", - "\u001b[33mcommit 406a4231cd865f81653eb97f7d5f61136360f9e5\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sun Mar 3 17:41:42 2024 -0600\n", - "\n", - " Update file README.md\n", - "\n", - "\u001b[33mcommit 64745e91caeed8149cb33be037b795f88b4ddd39\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Sun Mar 3 05:03:29 2024 -0600\n", - "\n", - " lec12 and lec13 updated\n", - "\n", - "\u001b[33mcommit 42568d125bb87a10c178ec1d4640e467f2d826b0\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Wed Feb 28 22:31:11 2024 -0600\n", - "\n", - " lab8\n", - "\n", - "\u001b[33mcommit e00ff203a601993ec07c23420ff223707228e820\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Wed Feb 28 22:26:10 2024 -0600\n", - "\n", - " lab6\n", - "\n", - "\u001b[33mcommit 408cff34905add795fb9cce796913372b184a60b\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Wed Feb 28 22:25:47 2024 -0600\n", - "\n", - " add lab7\n", - "\n", - "\u001b[33mcommit dd5c119d6d72ee039a3eb10608d2f57f56933d72\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Wed Feb 28 22:22:15 2024 -0600\n", - "\n", - " add lab6\n", - "\n", - 
"\u001b[33mcommit 27ce693c05fe8beae4bd2b15715454c6c3c3f3f7\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Tue Feb 27 00:24:06 2024 -0600\n", - "\n", - " lec11 updated\n", - "\n", - "\u001b[33mcommit 8fda6507879ad4a220675d7af1dcc4f2dc6eb8cb\u001b[m\n", - "Author: gsingh58 <gurmail-singh@cs.wisc.edu>\n", - "Date: Thu Feb 22 16:57:25 2024 -0600\n", - "\n", - " redundant html files deleted\n", - "\n", - "\u001b[33mcommit 3693ad5b1f9ff5e2ed5af15baac3579715d26c9d\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Thu Feb 22 03:45:27 2024 -0600\n", - "\n", - " lec10 updated\n", - "\n", - "\u001b[33mcommit 6b7ac80ca513020f4ac43898001896aeec597179\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Thu Feb 22 01:48:04 2024 -0600\n", - "\n", - " lec9 updated again\n", - "\n", - "\u001b[33mcommit 3c0d23f6d2568da03b4ad8032e912cf2899af829\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Wed Feb 21 15:07:18 2024 -0600\n", - "\n", - " add screenshot req\n", - "\n", - "\u001b[33mcommit 5ca95af225907454cf9ffc50d6fadd2dae8838a1\u001b[m\n", - "Author: gsingh58 <gurmail-singh@cs.wisc.edu>\n", - "Date: Tue Feb 20 13:36:44 2024 -0600\n", - "\n", - " lec9 solution updated\n", - "\n", - "\u001b[33mcommit eb9f52e5c56b09918c76c25d431fae6e4f0aca59\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Tue Feb 20 09:32:16 2024 -0600\n", - "\n", - " add lab4 req\n", - "\n", - "\u001b[33mcommit 5caf51ade3733e981b801edd007155334dd70a29\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Tue Feb 20 07:37:59 2024 -0600\n", - "\n", - " Lec9 updated HTML removed\n", - "\n", - "\u001b[33mcommit e8915e7e2c1daef12cf968949c7641afa11c0758\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Tue Feb 20 07:33:57 2024 -0600\n", - "\n", - " lec9 updated\n", - "\n", - "\u001b[33mcommit 15b3e1d7c9ea971f161b2fb4df4f253a45588fe8\u001b[m\n", - "Author: gsingh58 
<gurmail-singh@cs.wisc.edu>\n", - "Date: Thu Feb 15 13:45:20 2024 -0600\n", - "\n", - " a starter.ipynb file deleted\n", - "\n", - "\u001b[33mcommit 13ab0a06e5fa6ad7918b3564ab92d8e4a1914be1\u001b[m\n", - "Author: gsingh58 <gurmail-singh@cs.wisc.edu>\n", - "Date: Thu Feb 15 13:44:18 2024 -0600\n", - "\n", - " lec8 slides added\n", - "\n", - "\u001b[33mcommit faf0945797857eff185318c4146fdd00b57acd87\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Wed Feb 14 15:37:26 2024 -0600\n", - "\n", - " add lab5\n", - "\n", - "\u001b[33mcommit 19cd94a871641d13e77c58b03bfa50b53645473d\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Tue Feb 13 10:35:16 2024 -0600\n", - "\n", - " add exams\n", - "\n", - "\u001b[33mcommit 960fccecbdc787ecbe01fbe805e2d58255a47e35\u001b[m\n", - "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", - "Date: Tue Feb 13 03:49:40 2024 -0600\n", - "\n", - " Add new directory\n", - "\n", - "\u001b[33mcommit aa18ba4a4daa4c07a794b95aa647d33cbc4b21fc\u001b[m\n", - "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", - "Date: Tue Feb 13 03:48:21 2024 -0600\n", - "\n", - " Add new directory\n", - "\n", - "\u001b[33mcommit bf9845e5336eb0db7f309b55665c7bc46035bbf4\u001b[m\n", - "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", - "Date: Tue Feb 13 03:48:08 2024 -0600\n", - "\n", - " Add new directory\n", - "\n", - "\u001b[33mcommit 64fd3ae4f7677738f7fa88f75434674fb5b673fb\u001b[m\n", - "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", - "Date: Tue Feb 13 03:47:08 2024 -0600\n", - "\n", - " Add new directory\n", - "\n", - "\u001b[33mcommit 7daae2c37e007f641988e96e8252281a926089dd\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Tue Feb 13 03:40:01 2024 -0600\n", - "\n", - " worksheet add\n", - "\n", - "\u001b[33mcommit 489b6fba8db75a0b2ca55af0e774d21bdcf53447\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Tue Feb 13 03:23:22 2024 -0600\n", - "\n", - " readings moved\n", - "\n", 
- "\u001b[33mcommit 84ec381168bf2faefc4013883ecad4041ca97dd7\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Tue Feb 13 02:45:35 2024 -0600\n", - "\n", - " lec7 and 8 updated\n", - "\n", - "\u001b[33mcommit 4eb6f5c315386e467751a7636dbfc3ecce49aa40\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Thu Feb 8 14:19:12 2024 -0600\n", - "\n", - " add lab4\n", - "\n", - "\u001b[33mcommit ffce38e0a69d68e43186e37a74f9c5db7838d57a\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Thu Feb 8 08:14:20 2024 -0600\n", - "\n", - " lec6 updated\n", - "\n", - "\u001b[33mcommit 0436dacdd2a05927ee0cdc6b4f21c6af8bfd0f7f\u001b[m\n", - "Author: gsingh58 <gurmail-singh@cs.wisc.edu>\n", - "Date: Tue Feb 6 16:21:28 2024 -0600\n", - "\n", - " in_class_demo_lec2\n", - "\n", - "\u001b[33mcommit a5926f4b1152a2d1851a0c0d46e10337ec2e3307\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Tue Feb 6 13:20:11 2024 -0600\n", - "\n", - " update lab2\n", - "\n", - "\u001b[33mcommit e46ee7eb2210f5f4dceae060fe876f65bfaf9291\u001b[m\n", - "Author: gsingh58 <gurmail-singh@cs.wisc.edu>\n", - "Date: Tue Feb 6 11:23:25 2024 -0600\n", - "\n", - " wi.zip uploaded\n", - "\n", - "\u001b[33mcommit a58c5160b1df3fde7bb3a61efe0fb488d48a4f1e\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Tue Feb 6 08:16:41 2024 -0600\n", - "\n", - " lec5 notes updated\n", - "\n", - "\u001b[33mcommit aa58287550760afedc5d42c6d6a7418e256fc216\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Tue Feb 6 07:30:46 2024 -0600\n", - "\n", - " lec5 slides updated\n", - "\n", - "\u001b[33mcommit 775fe85700e8ea29573f379b762337214b0672a1\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Mon Feb 5 17:17:27 2024 -0600\n", - "\n", - " update lab2\n", - "\n", - "\u001b[33mcommit 513506b35bef83e8c9bdfd9101f3dd7e492f6630\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sun Feb 4 19:34:38 
2024 -0600\n", - "\n", - " add lab3\n", - "\n", - "\u001b[33mcommit a171f55cba1a8ff24be8235638b36394974f5f4e\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sun Feb 4 19:30:18 2024 -0600\n", - "\n", - " Update file README.md\n", - "\n", - "\u001b[33mcommit c205681624d12c40cda836a9aa7bfecdfb9e1b5e\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sun Feb 4 19:29:13 2024 -0600\n", - "\n", - " Update file README.md\n", - "\n", - "\u001b[33mcommit 1961c8e880f7b8bea9fe276fe5cfeaf070c8fda3\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sun Feb 4 19:27:30 2024 -0600\n", - "\n", - " update lab2\n", - "\n", - "\u001b[33mcommit d6f156b6680afce23e971fa02c7550b7ed1c8464\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sun Feb 4 19:27:00 2024 -0600\n", - "\n", - " update lab2\n", - "\n", - "\u001b[33mcommit 9933dd1f67182d29af40d794604474a942ac8d05\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sun Feb 4 19:13:10 2024 -0600\n", - "\n", - " add lab2\n", - "\n", - "\u001b[33mcommit ad8795c898efc2630a667e90db4fb3b198a1f281\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Sun Feb 4 07:52:55 2024 -0600\n", - "\n", - " redundant repos deleted\n", - "\n", - "\u001b[33mcommit 0c4c4e75d8efdcb5aaf5d0ef39e27b7dc7d3baf1\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Thu Feb 1 07:17:47 2024 -0600\n", - "\n", - " lec4 updated\n", - "\n", - "\u001b[33mcommit 78d0d8c28355c33713d8dc3a9cb271c9dc13b9fb\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Thu Feb 1 07:16:55 2024 -0600\n", - "\n", - " few more files added\n", - "\n", - "\u001b[33mcommit e15b0eae22fc127b87d155207ccf93687075429b\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Wed Jan 31 09:35:12 2024 -0600\n", - "\n", - " update lab1\n", - "\n", - "\u001b[33mcommit f6bf4ed07310ec20bdc81b07a7b84e931abc1d52\u001b[m\n", - "Author: JINLANG WANG 
<jwang2775@wisc.edu>\n", - "Date: Tue Jan 30 10:03:13 2024 -0600\n", - "\n", - " Update lab1.md\n", - "\n", - "\u001b[33mcommit 6ab88b7c35171871a9824fbde72b4886990744fc\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Tue Jan 30 08:18:02 2024 -0600\n", - "\n", - " a typo removed\n", - "\n", - "\u001b[33mcommit eb8dbdafb79b044801352602a23cce59e0cf2f5b\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Tue Jan 30 07:55:27 2024 -0600\n", - "\n", - " slides name changed\n", - "\n", - "\u001b[33mcommit acaeff6953d1faf55deac9ec23605368d8423407\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Tue Jan 30 07:45:22 2024 -0600\n", - "\n", - " html file added\n", - "\n", - "\u001b[33mcommit d8ef60dc6f2e014366c4878c4811019b2704da84\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Tue Jan 30 07:37:33 2024 -0600\n", - "\n", - " lec3 updated\n", - "\n", - "\u001b[33mcommit d7f3849918a34536c532fa2dbad09199dd06ed15\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sat Jan 27 17:26:22 2024 -0600\n", - "\n", - " add lab1\n", - "\n", - "\u001b[33mcommit 845fad19e5e68b7ba37b8d9814d0d7391d28475d\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sat Jan 27 17:24:03 2024 -0600\n", - "\n", - " add lab1\n", - "\n", - "\u001b[33mcommit 3e1e4395110c0bd26b3f8cd638cd2987585e84db\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sat Jan 27 17:23:06 2024 -0600\n", - "\n", - " add lab1\n", - "\n", - "\u001b[33mcommit 34bcc1d6d1451a19515690ee2ccdbfd2b2bfd9c5\u001b[m\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sat Jan 27 17:22:04 2024 -0600\n", - "\n", - " add lab1\n", - "\n", - "\u001b[33mcommit 8199778a4ca97efeac99f1a091f2f2038d3eabd2\u001b[m\n", - "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", - "Date: Fri Jan 26 21:33:46 2024 -0600\n", - "\n", - " Add new directory\n", - "\n", - "\u001b[33mcommit 
75866c80ee6a563bc31cd2182bdafd7371d01f62\u001b[m\n", - "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", - "Date: Fri Jan 26 21:33:36 2024 -0600\n", - "\n", - " Add new directory\n", - "\n", - "\u001b[33mcommit 8cd6029682fbe3253e1b045dc5c2523f99946e7e\u001b[m\n", - "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", - "Date: Fri Jan 26 21:33:26 2024 -0600\n", - "\n", - " Add new directory\n", - "\n", - "\u001b[33mcommit 99b66918e05eef2a193fee51f6159bf1268d361b\u001b[m\n", - "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", - "Date: Fri Jan 26 21:33:17 2024 -0600\n", - "\n", - " Add new directory\n", - "\n", - "\u001b[33mcommit 63c27755fdb1c938a8c55a9152970cba77f08ff9\u001b[m\n", - "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", - "Date: Fri Jan 26 21:27:17 2024 -0600\n", - "\n", - " Add new directory\n", - "\n", - "\u001b[33mcommit 5ab60fd384fd1d3934b864d8d74358491c4c1cbc\u001b[m\n", - "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", - "Date: Fri Jan 26 21:26:12 2024 -0600\n", - "\n", - " Add new directory\n", - "\n", - "\u001b[33mcommit 66e8e7233ecfd7e2493c07c5ff8e6006d20bb24c\u001b[m\n", - "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", - "Date: Fri Jan 26 21:15:22 2024 -0600\n", - "\n", - " Add new directory\n", - "\n", - "\u001b[33mcommit ad08df7aa2e0bc2396dfe580c76f8b386b0248ff\u001b[m\n", - "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", - "Date: Fri Jan 26 20:25:44 2024 -0600\n", - "\n", - " Add new directory\n", - "\n", - "\u001b[33mcommit 86f9157696f95c31e4d914b4e89fb5974f537e87\u001b[m\n", - "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", - "Date: Fri Jan 26 20:25:29 2024 -0600\n", - "\n", - " Add new directory\n", - "\n", - "\u001b[33mcommit fe4faf3439ab3b8caab1748762d83891187f54e9\u001b[m\n", - "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", - "Date: Fri Jan 26 20:14:28 2024 -0600\n", - "\n", - " Add new directory\n", - "\n", - "\u001b[33mcommit bfb93b54fb43fc18d6dacd22865416f32027c0bb\u001b[m\n", - "Author: GURMAIL SINGH 
<gurmail.singh@wisc.edu>\n", - "Date: Fri Jan 26 20:14:06 2024 -0600\n", - "\n", - " Add new directory\n", - "\n", - "\u001b[33mcommit ed7d967ede5422f9f3e2eeba85a105ffa2d03db0\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Thu Jan 25 08:10:04 2024 -0600\n", - "\n", - " lec2 py file added\n", - "\n", - "\u001b[33mcommit 81f38731851c3b844e0d4855f3ccfbb259579d2f\u001b[m\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Thu Jan 25 07:38:47 2024 -0600\n", - "\n", - " lec2 update\n", - "\n", - "\u001b[33mcommit 92279f04219c9d9fdcf504f10e3cb7f41b9a9c3a\u001b[m\n", - "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", - "Date: Tue Jan 23 04:23:37 2024 -0600\n", - "\n", - " Configure SAST in `.gitlab-ci.yml`, creating this file if it does not already exist\n", - "\n", - "\u001b[33mcommit 4721ddd9ae732b4ca058962aa3df2eb1614f45b0\u001b[m\n", - "Author: GURMAIL SINGH <gurmail.singh@wisc.edu>\n", - "Date: Tue Jan 23 04:23:36 2024 -0600\n", - "\n", - " Initial commit\n" - ] - } - ], - "source": [ - "!git log" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "ad6cd0a1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "commit f5b5fff2409b01c2e12f3a7baae131e9a23d964b\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Wed Mar 13 09:39:09 2024 -0500\n", - "\n", - " regex_2_lec_002 renamed as regex_1_lec_002\n", - "\n", - "commit e017cc00d97679786d97c469733117c016e3ba9b\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Wed Mar 13 09:34:43 2024 -0500\n", - "\n", - " web_4_lec_002 added\n", - "\n", - "commit 7897fec9ecf701e61b2ed2713fabe72e726ffd7c\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Tue Mar 12 05:51:48 2024 -0500\n", - "\n", - " lec14 and lec15 added\n", - "\n", - "commit c13\n" - ] - } - ], - "source": [ - "git_log_output = str(check_output([\"git\", \"log\"]), encoding=\"utf-8\")\n", - "print(git_log_output[:500])" - ] - }, - { - 
"cell_type": "markdown", - "id": "f550942e", - "metadata": {}, - "source": [ - "#### GOAL: find all the commit numbers" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "33489c0f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['f5b5fff2409b01c2e12f3a7baae131e9a23d964b',\n", - " 'e017cc00d97679786d97c469733117c016e3ba9b',\n", - " '7897fec9ecf701e61b2ed2713fabe72e726ffd7c',\n", - " 'c13cfb21e69230b393ef2051027d029322c12cac',\n", - " '95bfde13c281a6eb28f0cd98f6dc393762732d4c',\n", - " '90111df9e72309597c67da15bb61de78bff126e7',\n", - " 'c401ade096b0dd3e1178f46d067b2fbb98d499f3',\n", - " 'c99c65b5efdaa91f8fce8fadffc41c4747e0a3a0',\n", - " 'df5877233cc57005e3003c19c0dbf89aafc53804',\n", - " '26470de563bbe3bc65a73718aee1988ba64e8601']" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "commits = re.findall(r\"[0-9a-f]{40}\", git_log_output)\n", - "# recent 10 commit numbers\n", - "commits[:10]" - ] - }, - { - "cell_type": "markdown", - "id": "a2d1c384", - "metadata": {}, - "source": [ - "#### What days of the week does the team push things into this repo?" 
- ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "d2243734", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "commit f5b5fff2409b01c2e12f3a7baae131e9a23d964b\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Wed Mar 13 09:39:09 2024 -0500\n", - "\n", - " regex_2_lec_002 renamed as regex_1_lec_002\n", - "\n", - "commit e017cc00d97679786d97c469733117c016e3ba9b\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Wed Mar 13 09:34:43 2024 -0500\n", - "\n", - " web_4_lec_002 added\n", - "\n", - "commit 7897fec9ecf701e61b2ed2713fabe72e726ffd7c\n", - "Author: gsingh58 <gurmail-singh@wisc.edu>\n", - "Date: Tue Mar 12 05:51:48 2024 -0500\n", - "\n", - " lec14 and lec15 added\n", - "\n", - "commit c13\n" - ] - } - ], - "source": [ - "print(git_log_output[:500])" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "40198305", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Wed',\n", - " 'Wed',\n", - " 'Tue',\n", - " 'Mon',\n", - " 'Sat',\n", - " 'Sat',\n", - " 'Sat',\n", - " 'Thu',\n", - " 'Thu',\n", - " 'Tue',\n", - " 'Sun',\n", - " 'Sun',\n", - " 'Wed',\n", - " 'Wed',\n", - " 'Wed',\n", - " 'Wed',\n", - " 'Tue',\n", - " 'Thu',\n", - " 'Thu',\n", - " 'Thu',\n", - " 'Wed',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Thu',\n", - " 'Thu',\n", - " 'Wed',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Thu',\n", - " 'Thu',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Mon',\n", - " 'Sun',\n", - " 'Sun',\n", - " 'Sun',\n", - " 'Sun',\n", - " 'Sun',\n", - " 'Sun',\n", - " 'Sun',\n", - " 'Thu',\n", - " 'Thu',\n", - " 'Wed',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Tue',\n", - " 'Sat',\n", - " 'Sat',\n", - " 'Sat',\n", - " 'Sat',\n", - " 'Fri',\n", - " 'Fri',\n", - " 'Fri',\n", - " 
'Fri',\n", - " 'Fri',\n", - " 'Fri',\n", - " 'Fri',\n", - " 'Fri',\n", - " 'Fri',\n", - " 'Fri',\n", - " 'Fri',\n", - " 'Thu',\n", - " 'Thu',\n", - " 'Tue',\n", - " 'Tue']" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "days = re.findall(r\"Date:\\s+(\\w+)\", git_log_output)\n", - "days" - ] - }, - { - "cell_type": "markdown", - "id": "d04f0835", - "metadata": {}, - "source": [ - "#### Count unique days" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "24c3f4a0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Tue 27\n", - "Thu 13\n", - "Fri 11\n", - "Wed 9\n", - "Sun 9\n", - "Sat 7\n", - "Mon 2\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "day_counts = pd.Series(days).value_counts()\n", - "day_counts" - ] - }, - { - "cell_type": "markdown", - "id": "7c2c6899", - "metadata": {}, - "source": [ - "#### Sort by day of the week" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "cca2506c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Mon 2\n", - "Tue 27\n", - "Wed 9\n", - "Thu 13\n", - "Fri 11\n", - "Sun 9\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sorted_day_counts = day_counts.loc[[\"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sun\"]]\n", - "sorted_day_counts" - ] - }, - { - "cell_type": "markdown", - "id": "c7f30c6a", - "metadata": {}, - "source": [ - "#### Create a bar plot" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "18eb90d6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Text(0.5, 0, 'Days of the week')" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "<Figure size 
640x480 with 1 Axes>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "ax = sorted_day_counts.plot.bar()\n", - "ax.set_ylabel(\"Commit counts\")\n", - "ax.set_xlabel(\"Days of the week\")" - ] - }, - { - "cell_type": "markdown", - "id": "42758038", - "metadata": {}, - "source": [ - "#### Find all commit authors names." - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "a87693e9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'gsingh58'" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "authors = re.findall(r\"Author:\\s+(.+?)\\s*<\", git_log_output)\n", - "authors[0]" - ] - }, - { - "cell_type": "markdown", - "id": "cbc516d7", - "metadata": {}, - "source": [ - "#### `git log` from projects repo" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "e2f85aa2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "commit 129a4745b416e3f0be08795dca69d02d528fe893\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Mon Mar 11 10:37:16 2024 -0500\n", - "\n", - " Update file README.md\n", - "\n", - "commit 413d84dceb0f48e111b25d9f7765513181feb6d6\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sat Mar 9 18:46:18 2024 -0600\n", - "\n", - " Update 2 files\n", - " \n", - " - /Labs/Lab10/README\n", - " - /Labs/Lab10/README.md\n", - "\n", - "commit bd2acf092cfeacdc994dac733300ab61a3373b26\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sat Mar 9 18:45:42 2024 -0600\n", - "\n", - " lab10\n", - "\n", - "commit f84c2a89a44d374da385bb499738ec82b12b7965\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sat Mar 9 18:42:28 2024 -0600\n", - "\n", - " Update file EDGAR.md\n", - "\n", - "commit 11b505faae9964182b99288210f54bae5ce3e211\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sat Mar 9 18:41:35 2024 -0600\n", - "\n", - 
" Update file README.md\n", - "\n", - "commit 5f35a23cf70d24e627fef5fd89c0711cb144dbc4\n", - "Author: JINLANG WANG <jwang2775@wisc.edu>\n", - "Date: Sat Mar 9 18:41:06 2024 -0600\n", - "\n", - " lab9\n", - "\n", - "commit d481d4de35443a07812af9216d6883300207ae6\n" - ] - } - ], - "source": [ - "git_log_output = str(check_output([\"git\", \"log\"], cwd=\"../../projects-and-labs\"), encoding=\"utf-8\")\n", - "print(git_log_output[:1000])" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "59acc090", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[' P4 pipeline update',\n", - " ' P4 released',\n", - " ' fixing pipeline and adding backup mp3 tester file to ease confusion',\n", - " ' mp3 install help',\n", - " ' pipeline update for MP3',\n", - " ' p3 pipeline changes',\n", - " ' P4 pipeline setup',\n", - " ' updating MP3 tester.py and MP3 pipeline',\n", - " ' P3 released',\n", - " ' P3 released',\n", - " ' P2 key updated',\n", - " ' MP2 Update/Fix to the tester',\n", - " ' MP2 key fix + readme update',\n", - " ' mp1 readme updated',\n", - " ' P2 Release',\n", - " ' gitlab tutorial + mp1 release',\n", - " ' initial commit (P1)']" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "re.findall(r\".*[pP][1-6].*\", git_log_output)" - ] - }, - { - "cell_type": "markdown", - "id": "29085f57", - "metadata": {}, - "source": [ - "### Emails example" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "25b005af", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Gurmail [Instructor] - gsingh58(AT) cs.wisc.edu\n", - "Jinlang [Head TA] - jwang2775 (AT) wisc.edu\n", - "Elliot [TA] - eepickens (AT) cs.wisc.edu\n", - "Alex [TA] - aclinton (AT) wisc.edu\n", - "Bowman [TA] - bnbrown3 (AT) wisc.edu\n", - "Hafeez [TA] - aneesali (AT) wisc.edu\n", - "William [TA] - wycong (AT) wisc.edu\n", - "Someone [PM] - 
someone@wisc.edu\n", - "\n" - ] - } - ], - "source": [ - "s = \"\"\"\n", - "Gurmail [Instructor] - gsingh58(AT) cs.wisc.edu\n", - "Jinlang [Head TA] - jwang2775 (AT) wisc.edu\n", - "Elliot [TA] - eepickens (AT) cs.wisc.edu\n", - "Alex [TA] - aclinton (AT) wisc.edu\n", - "Bowman [TA] - bnbrown3 (AT) wisc.edu\n", - "Hafeez [TA] - aneesali (AT) wisc.edu\n", - "William [TA] - wycong (AT) wisc.edu\n", - "Someone [PM] - someone@wisc.edu\n", - "\"\"\"\n", - "print(s)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "ea45c263", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('gsingh58(AT) cs.wisc.edu',\n", - " 'gsingh58',\n", - " '(AT)',\n", - " '(AT)',\n", - " 'cs.wisc.edu',\n", - " 'wisc.',\n", - " 'edu'),\n", - " ('jwang2775 (AT) wisc.edu',\n", - " 'jwang2775',\n", - " '(AT)',\n", - " '(AT)',\n", - " 'wisc.edu',\n", - " '',\n", - " 'edu'),\n", - " ('eepickens (AT) cs.wisc.edu',\n", - " 'eepickens',\n", - " '(AT)',\n", - " '(AT)',\n", - " 'cs.wisc.edu',\n", - " 'wisc.',\n", - " 'edu'),\n", - " ('aclinton (AT) wisc.edu', 'aclinton', '(AT)', '(AT)', 'wisc.edu', '', 'edu'),\n", - " ('bnbrown3 (AT) wisc.edu', 'bnbrown3', '(AT)', '(AT)', 'wisc.edu', '', 'edu'),\n", - " ('aneesali (AT) wisc.edu', 'aneesali', '(AT)', '(AT)', 'wisc.edu', '', 'edu'),\n", - " ('wycong (AT) wisc.edu', 'wycong', '(AT)', '(AT)', 'wisc.edu', '', 'edu'),\n", - " ('someone@wisc.edu', 'someone', '@', '', 'wisc.edu', '', 'edu')]" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "name = r\"\\w+\"\n", - "at = r\"@|([\\(\\[]?[Aa][Tt][\\)\\]]?)\"\n", - "domain = r\"\\w+\\.(\\w+\\.)?(edu|com|org|net|io|gov)\"\n", - "\n", - "full_regex = f\"(({name})\\s*({at})\\s*({domain}))\"\n", - "\n", - "re.findall(full_regex, s)" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "7d04d86f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "REGEX: 
((\\w+)\\s*(@|([\\(\\[]?[Aa][Tt][\\)\\]]?))\\s*(\\w+\\.(\\w+\\.)?(edu|com|org|net|io|gov)))\n", - "gsingh58@cs.wisc.edu\n", - "jwang2775@wisc.edu\n", - "eepickens@cs.wisc.edu\n", - "aclinton@wisc.edu\n", - "bnbrown3@wisc.edu\n", - "aneesali@wisc.edu\n", - "wycong@wisc.edu\n", - "someone@wisc.edu\n" - ] - } - ], - "source": [ - "print(\"REGEX:\", full_regex)\n", - "for match in re.findall(full_regex, s):\n", - " print(match[1] + \"@\" + match[4])" - ] - }, - { - "cell_type": "markdown", - "id": "5aa7f04e", - "metadata": {}, - "source": [ - "### Self-practice\n", - "\n", - "Q1: Which regex will NOT match \"123\"\n", - "1. r\"\\d\\d\\d\"\n", - "2. r\"\\d{3}\"\n", - "3. r\"\\D\\D\\D\"\n", - "4. r\"...\"\n", - "\n", - "Q2: What will r\"^A\" match?\n", - "1. \"A\"\n", - "2. \"^A\"\n", - "3. \"BA\"\n", - "4. \"B\"\n", - "5. \"BB\"\n", - "\n", - "Q3: Which one can match \"HH\"?\n", - "1. r\"HA+H\"\n", - "2. r\"HA+?H\"\n", - "3. r\"H(A+)?H\"\n", - "\n", - "Q4: Which string(s) will match r\"^(ha)*$\"\n", - "1. \"\"\n", - "2. \"hahah\"\n", - "3. \"that\"\n", - "4. \"HAHA\"\n", - "\n", - "Q5: What is the type of the following?re.findall(r\"(\\d) (\\w+)\", some_str)[0]\n", - "1. list\n", - "2. tuple\n", - "3. string\n", - "\n", - "Q6: What will it do?\n", - "```python\n", - "re.sub(r\"(\\d{3})-(\\d{3}-\\d{4})\",\n", - " r\"(\\g<1>) \\g<2>\",\n", - " \"608-123-4567\")\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "3ff199e4", - "metadata": {}, - "source": [ - "The answers of these questions can be found in self_practice.ipynb. You may want to try to answer these questions yourself and then verify your answers." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/lecture_material/15-regex_2/regex_2_lec_001.ipynb b/lecture_material/15-regex_2/regex_2_lec_001.ipynb deleted file mode 100644 index db06eb7fa32b923d38c746a1408a37396c9ee014..0000000000000000000000000000000000000000 --- a/lecture_material/15-regex_2/regex_2_lec_001.ipynb +++ /dev/null @@ -1,835 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "e60c1c48", - "metadata": {}, - "source": [ - "# Regex 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0dba68b0", - "metadata": {}, - "outputs": [], - "source": [ - "#import statements\n", - "import re\n", - "from subprocess import check_output\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b97c8008-8e39-4a9f-89a1-0d1ddbb1ac01", - "metadata": {}, - "outputs": [], - "source": [ - "# Example strings\n", - "# from DS100 book...\n", - "def reg(regex, text):\n", - " \"\"\"\n", - " Prints the string with the regex match highlighted.\n", - " \"\"\"\n", - " print(re.sub(f'({regex})', r'\\033[1;30;43m\\1\\033[m', text))\n", - "s1 = \" \".join([\"A DAG is a directed graph without cycles.\",\n", - " \"A tree is a DAG where every node has one parent (except the root, which has none).\",\n", - " \"To learn more, visit www.example.com or call 1-608-123-4567. 
:) ¯\\_(ツ)_/¯\"])\n", - "print(s1)\n", - "\n", - "s2 = \"\"\"1-608-123-4567\n", - "a-bcd-efg-hijg (not a phone number)\n", - "1-608-123-456 (not a phone number)\n", - "608-123-4567\n", - "123-4567\n", - "1-123-4567 (not a phone number)\n", - "\"\"\"\n", - "print(s2)\n", - "\n", - "s3 = \"In CS 320, there are 11 quizzes, 6 projects, 28 lectures, and 1000 things to learn. CS 320 is awesome!\"\n", - "print(s3)\n", - "\n", - "s4 = \"\"\"In CS 320, there are 11 quizzes, 6 projects,\n", - "28 lectures, and 1000 things to learn. CS 320 is awesome!\"\"\"\n", - "print(s4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "924069c5-82be-4423-b659-2beee8e226be", - "metadata": {}, - "outputs": [], - "source": [ - "print(s1)" - ] - }, - { - "cell_type": "markdown", - "id": "6a2eff34", - "metadata": {}, - "source": [ - "### Regex is case sensitive\n", - "\n", - "### Character classes\n", - "\n", - "- Character classes can be mentioned within `[...]`\n", - "- `^` means `NOT` of a character class\n", - "- `-` enables us to mention range of characters, for example `[A-Z]`\n", - "- `|` enables us to perform `OR`\n", - "\n", - "### Metacharacters\n", - "\n", - "- predefined character classes\n", - " - `\\d` => digits\n", - " - `\\s` => whitespace (space, tab, newline)\n", - " - `\\w` => \"word\" characters (digits, letters, underscores, etc) --- helpful for variable name matches and whole word matches (as it doesn't match whitespace --- `\\s`)\n", - " - `.` => wildcard: anything except newline\n", - "- capitalized version of character classes mean `NOT`, for example `\\D` => everything except digits\n", - "\n", - "### REPETITION\n", - "\n", - "- `<character>{<num matches>}` - for example: `w{3}`\n", - "- matches cannot overlap\n", - "\n", - "### Variable length repitition operators\n", - "\n", - "- `*` => 0 or more (greedy: match as many characters as possible)\n", - "- `+` => 1 or more (greedy: match as many characters as possible)\n", - "- `?` => 0 or 1\n", - 
"- `*?` => 0 or more (non-greedy: match as few characters as possible)\n", - "- `+?` => 1 or more (non-greedy: match as few characters as possible)\n", - "\n", - "#### Find everything inside of parentheses." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11f75d92-f215-47f4-8804-5e5727d3be55", - "metadata": {}, - "outputs": [], - "source": [ - "# this doesn't work\n", - "# it captures everything because () have special meaning (coming up)\n", - "reg(r\"\", s1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b488460e", - "metadata": {}, - "outputs": [], - "source": [ - "# How can we change this to not use special meaning of ()?\n", - "# * is greedy: match as many characters as possible\n", - "reg(r\"(.*)\", s1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "42155b54-d27b-4418-81a1-c005c508d738", - "metadata": {}, - "outputs": [], - "source": [ - "# non-greedy: stop at the first possible spot instead of the last possible spot\n", - "reg(r\"\\(.*\\)\", s1)" - ] - }, - { - "cell_type": "markdown", - "id": "0fd70cfd", - "metadata": {}, - "source": [ - "### Anchor characters\n", - "- `^` => start of string\n", - " - `^` is overloaded --- what was the other usage?\n", - "- `$` => end of string\n", - "\n", - "#### Find everything in the first sentence." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "40fed5db", - "metadata": {}, - "outputs": [], - "source": [ - "# doesn't work because remember regex finds all possible matches\n", - "# so it matches every single sentence \n", - "# (even though we are doing non-greedy match)\n", - "reg(r\"\", s1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e97abd8", - "metadata": {}, - "outputs": [], - "source": [ - "reg(r\".*?\\.\", s1)" - ] - }, - { - "cell_type": "markdown", - "id": "f66a4651", - "metadata": {}, - "source": [ - "#### Find everything in the first two sentences." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cfa8353e-4402-4f64-b3f8-35e73b2d7a68", - "metadata": {}, - "outputs": [], - "source": [ - "reg(r\"\", s1)" - ] - }, - { - "cell_type": "markdown", - "id": "76570acd", - "metadata": {}, - "source": [ - "#### Find last \"word\" in the sentence." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f35a6c81-47b5-48eb-8357-888fe832f85b", - "metadata": {}, - "outputs": [], - "source": [ - "reg(r\"\", s1)" - ] - }, - { - "cell_type": "markdown", - "id": "4b25fb66", - "metadata": {}, - "source": [ - "### Case study: find all phone numbers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ecbeaf0", - "metadata": {}, - "outputs": [], - "source": [ - "print(s2)\n", - "# The country code (1) in the front is optional\n", - "# The area code (608) is also optional\n", - "# Doesn't make sense to match country code without area code though!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88a01725-790c-4f21-b334-e3d31bed24b5", - "metadata": {}, - "outputs": [], - "source": [ - "# Full US phone numbers\n", - "reg(r\"\", s2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea8ce0be-98f8-4f9c-bcc0-1d8a0bb183a8", - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# The country code (1) in the front is optional\n", - "reg(r\"\", s2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5befb23a-848f-44d4-aaac-6b21ac0bbf43", - "metadata": {}, - "outputs": [], - "source": [ - "# The area code (608) is also optional\n", - "# Doesn't make sense to have country code without area code though!\n", - "reg(r\"\", s2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "34164fd1-a422-45a0-8e11-54199ca77120", - "metadata": {}, - "outputs": [], - "source": [ - "# This is good enough for 320 quizzes/tests\n", - "# But 
clearly, the last match is not correct\n", - "reg(r\"\", s2)" - ] - }, - { - "cell_type": "markdown", - "id": "8a2ee4e2", - "metadata": {}, - "source": [ - "Regex documentation link: https://docs.python.org/3/library/re.html." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "694a585b-b5a7-4a6f-a0f1-60521f7dfc47", - "metadata": {}, - "outputs": [], - "source": [ - "# BONUS: negative lookbehind (I won't test this)\n", - "reg(r\"(?<!\\d\\-)((\\d-)?\\d{3}-)?\\d{3}-\\d{4}\", s2)" - ] - }, - { - "cell_type": "markdown", - "id": "3973350b", - "metadata": {}, - "source": [ - "There is also a negative lookahead. For example, how to avoid matching \"1-608-123-456\" in \"1-608-123-4569999\". You can explore this if you are interested." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4988d765", - "metadata": {}, - "outputs": [], - "source": [ - "reg(r\"(?<!\\d\\-)((\\d-)?\\d{3}-)?\\d{3}-\\d{4}\", \"608-123-4569999\")" - ] - }, - { - "cell_type": "markdown", - "id": "b02ae9e0", - "metadata": {}, - "source": [ - "### Testing your regex\n", - "- you could use `reg(...)` function\n", - "- another useful resource: https://regex101.com/" - ] - }, - { - "cell_type": "markdown", - "id": "4a973271", - "metadata": {}, - "source": [ - "### `re` module\n", - "- `re.findall(<PATTERN>, <SEARCH STRING>)`: regular expression matches\n", - " - returns a list of strings \n", - "- `re.sub(<PATTERN>, <REPLACEMENT>, <SEARCH STRING>)`: regular expression match + substitution\n", - " - returns a new string with the substitutions (remember strings are immutable)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "73ec525f", - "metadata": {}, - "outputs": [], - "source": [ - "msg = \"In CS 320,\\tthere are 28 lectures, 11 quizzes, 3 exams,\\t6 projects, and 1000 things to learn. CS 320 is awesome!\"\n", - "print(msg)" - ] - }, - { - "cell_type": "markdown", - "id": "34998a5e", - "metadata": {}, - "source": [ - "#### Find all digits." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f42c25a", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "70b2f488", - "metadata": {}, - "source": [ - "### Groups\n", - "- we can capture matches using `()` => this is the special meaning of `()`\n", - "- returns a list of tuples, where length of the tuple will be number of groups\n", - "\n", - "#### Find all digits and the word that comes after that." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5309adee", - "metadata": {}, - "outputs": [], - "source": [ - "matches = re.findall(r\"\", msg)\n", - "matches" - ] - }, - { - "cell_type": "markdown", - "id": "bc6a982c", - "metadata": {}, - "source": [ - "#### Goal: make a dict (course component => count, like \"projects\" => 6)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7f1a028", - "metadata": {}, - "outputs": [], - "source": [ - "course_dict = {}\n", - "for count, component in matches:\n", - " course_dict[component] = int(count)\n", - "course_dict" - ] - }, - { - "cell_type": "markdown", - "id": "c4b6b505", - "metadata": {}, - "source": [ - "### Unlike matches, groups can overlap\n", - "\n", - "#### Find and group all digits and the word that comes after that." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "491c3460", - "metadata": {}, - "outputs": [], - "source": [ - "re.findall(r\"(\\d+) (\\w+)\", msg)" - ] - }, - { - "cell_type": "markdown", - "id": "d2227e69", - "metadata": {}, - "source": [ - "#### Substitute all digits with \"###\"." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d1fede1", - "metadata": {}, - "outputs": [], - "source": [ - "re.sub(r\"\", , msg)" - ] - }, - { - "cell_type": "markdown", - "id": "9d531122", - "metadata": {}, - "source": [ - "#### Goal: normalize whitespace (everything will be a single space)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4becbe70", - "metadata": {}, - "outputs": [], - "source": [ - "print(msg)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "72a6eb42", - "metadata": {}, - "outputs": [], - "source": [ - "re.sub(r\"\", , msg)" - ] - }, - { - "cell_type": "markdown", - "id": "6faf33fd", - "metadata": {}, - "source": [ - "### How to use groups is substitution?\n", - "- `\\g<N>` gives you the result of the N'th grouping.\n", - "\n", - "#### Substitute all course component counts with HTML bold tags." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8df577fd", - "metadata": {}, - "outputs": [], - "source": [ - "print(re.sub(r\"(\\d+)\", \"<b></b>\", msg))" - ] - }, - { - "cell_type": "markdown", - "id": "35a15a41", - "metadata": {}, - "source": [ - "In CS <b>320</b>, there are <b>28</b> lectures, <b>11</b> quizzes, <b>3</b> exams, <b>6</b> projects, and <b>1000</b> things to learn. CS <b>320</b> is awesome!" 
- ] - }, - { - "cell_type": "markdown", - "id": "6b299526", - "metadata": {}, - "source": [ - "### Git log example" - ] - }, - { - "cell_type": "markdown", - "id": "a9b7261c", - "metadata": {}, - "source": [ - "#### Run `git log` as a shell command" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10e459b4", - "metadata": {}, - "outputs": [], - "source": [ - "!git log" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ef440fed", - "metadata": {}, - "outputs": [], - "source": [ - "git_log_output = str(check_output([\"git\", \"log\"]), encoding=\"utf-8\")\n", - "print(git_log_output[:500])" - ] - }, - { - "cell_type": "markdown", - "id": "5c154b46", - "metadata": {}, - "source": [ - "#### GOAL: find all the commit numbers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ea954bf", - "metadata": {}, - "outputs": [], - "source": [ - "commits = re.findall(r\"\", git_log_output)\n", - "# recent 10 commit numbers\n", - "commits[:10]" - ] - }, - { - "cell_type": "markdown", - "id": "bc485b5f", - "metadata": {}, - "source": [ - "#### What days of the week does the team push things into this repo?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "57353c44", - "metadata": {}, - "outputs": [], - "source": [ - "print(git_log_output[:500])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d1ea6f59", - "metadata": {}, - "outputs": [], - "source": [ - "days = re.findall(r\"\", git_log_output)\n", - "days" - ] - }, - { - "cell_type": "markdown", - "id": "2c7efb55", - "metadata": {}, - "source": [ - "#### Count unique days" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3c2d7207", - "metadata": {}, - "outputs": [], - "source": [ - "day_counts = pd.Series(days).value_counts()\n", - "day_counts" - ] - }, - { - "cell_type": "markdown", - "id": "7317ca35", - "metadata": {}, - "source": [ - "#### Sort by day of the week" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d5c5c58c", - "metadata": {}, - "outputs": [], - "source": [ - "sorted_day_counts = day_counts.loc[[\"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sun\"]]\n", - "sorted_day_counts" - ] - }, - { - "cell_type": "markdown", - "id": "745afee5", - "metadata": {}, - "source": [ - "#### Create a bar plot" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7bb8f6f", - "metadata": {}, - "outputs": [], - "source": [ - "ax = sorted_day_counts.plot.bar()\n", - "ax.set_ylabel(\"Commit counts\")\n", - "ax.set_xlabel(\"Days of the week\")" - ] - }, - { - "cell_type": "markdown", - "id": "ecfc71e6", - "metadata": {}, - "source": [ - "#### Find all commit authors names." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6153035a", - "metadata": {}, - "outputs": [], - "source": [ - "authors = re.findall(r\"\", git_log_output)\n", - "authors[0]" - ] - }, - { - "cell_type": "markdown", - "id": "3fa201fb", - "metadata": {}, - "source": [ - "#### `git log` from projects repo" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e200a8b0", - "metadata": {}, - "outputs": [], - "source": [ - "git_log_output = str(check_output([\"git\", \"log\"], cwd=\"../../projects-and-labs\"), encoding=\"utf-8\")\n", - "print(git_log_output[:1000])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "053b2607", - "metadata": {}, - "outputs": [], - "source": [ - "re.findall(r\"\", git_log_output)" - ] - }, - { - "cell_type": "markdown", - "id": "3ce53c79", - "metadata": {}, - "source": [ - "### Emails example" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1968c0ff", - "metadata": {}, - "outputs": [], - "source": [ - "s = \"\"\"\n", - "Gurmail [Instructor] - gsingh58(AT) cs.wisc.edu\n", - "Jinlang [Head TA] - jwang2775 (AT) wisc.edu\n", - "Elliot [TA] - eepickens (AT) cs.wisc.edu\n", - "Alex [TA] - aclinton (AT) wisc.edu\n", - "Bowman [TA] - bnbrown3 (AT) wisc.edu\n", - "Hafeez [TA] - aneesali (AT) wisc.edu\n", - "William [TA] - wycong (AT) wisc.edu\n", - "\"\"\"\n", - "print(s)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5fbfdf12", - "metadata": {}, - "outputs": [], - "source": [ - "name = r\"\\w+\"\n", - "at = r\"@|([\\(\\[]?[Aa][Tt][\\)\\]]?)\"\n", - "domain = r\"\\w+\\.(\\w+\\.)?(edu|com|org|net|io|gov)\"\n", - "\n", - "full_regex = f\"(({name})\\s*({at})\\s*({domain}))\"\n", - "\n", - "re.findall(full_regex, s)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2257dbf1", - "metadata": {}, - "outputs": [], - "source": [ - "print(\"REGEX:\", full_regex)\n", - "for match in re.findall(full_regex, s):\n", - " 
print(match[1] + \"@\" + match[4])" - ] - }, - { - "cell_type": "markdown", - "id": "16c6c169", - "metadata": {}, - "source": [ - "### Self-practice\n", - "\n", - "Q1: Which regex will NOT match \"123\"\n", - "1. r\"\\d\\d\\d\"\n", - "2. r\"\\d{3}\"\n", - "3. r\"\\D\\D\\D\"\n", - "4. r\"...\"\n", - "\n", - "Q2: What will r\"^A\" match?\n", - "1. \"A\"\n", - "2. \"^A\"\n", - "3. \"BA\"\n", - "4. \"B\"\n", - "5. \"BB\"\n", - "\n", - "Q3: Which one can match \"HH\"?\n", - "1. r\"HA+H\"\n", - "2. r\"HA+?H\"\n", - "3. r\"H(A+)?H\"\n", - "\n", - "Q4: Which string(s) will match r\"^(ha)*$\"\n", - "1. \"\"\n", - "2. \"hahah\"\n", - "3. \"that\"\n", - "4. \"HAHA\"\n", - "\n", - "Q5: What is the type of the following?re.findall(r\"(\\d) (\\w+)\", some_str)[0]\n", - "1. list\n", - "2. tuple\n", - "3. string\n", - "\n", - "Q6: What will it do?\n", - "```python\n", - "re.sub(r\"(\\d{3})-(\\d{3}-\\d{4})\",\n", - " r\"(\\g<1>) \\g<2>\",\n", - " \"608-123-4567\")\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "f1184ba1", - "metadata": {}, - "source": [ - "The answers of these questions can be found in self_practice.ipynb. You may want to try to answer these questions yourself and then verify your answers." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/lecture_material/15-regex_2/regex_2_lec_002.ipynb b/lecture_material/15-regex_2/regex_2_lec_002.ipynb deleted file mode 100644 index db06eb7fa32b923d38c746a1408a37396c9ee014..0000000000000000000000000000000000000000 --- a/lecture_material/15-regex_2/regex_2_lec_002.ipynb +++ /dev/null @@ -1,835 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "e60c1c48", - "metadata": {}, - "source": [ - "# Regex 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0dba68b0", - "metadata": {}, - "outputs": [], - "source": [ - "#import statements\n", - "import re\n", - "from subprocess import check_output\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b97c8008-8e39-4a9f-89a1-0d1ddbb1ac01", - "metadata": {}, - "outputs": [], - "source": [ - "# Example strings\n", - "# from DS100 book...\n", - "def reg(regex, text):\n", - " \"\"\"\n", - " Prints the string with the regex match highlighted.\n", - " \"\"\"\n", - " print(re.sub(f'({regex})', r'\\033[1;30;43m\\1\\033[m', text))\n", - "s1 = \" \".join([\"A DAG is a directed graph without cycles.\",\n", - " \"A tree is a DAG where every node has one parent (except the root, which has none).\",\n", - " \"To learn more, visit www.example.com or call 1-608-123-4567. 
:) ¯\\_(ツ)_/¯\"])\n", - "print(s1)\n", - "\n", - "s2 = \"\"\"1-608-123-4567\n", - "a-bcd-efg-hijg (not a phone number)\n", - "1-608-123-456 (not a phone number)\n", - "608-123-4567\n", - "123-4567\n", - "1-123-4567 (not a phone number)\n", - "\"\"\"\n", - "print(s2)\n", - "\n", - "s3 = \"In CS 320, there are 11 quizzes, 6 projects, 28 lectures, and 1000 things to learn. CS 320 is awesome!\"\n", - "print(s3)\n", - "\n", - "s4 = \"\"\"In CS 320, there are 11 quizzes, 6 projects,\n", - "28 lectures, and 1000 things to learn. CS 320 is awesome!\"\"\"\n", - "print(s4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "924069c5-82be-4423-b659-2beee8e226be", - "metadata": {}, - "outputs": [], - "source": [ - "print(s1)" - ] - }, - { - "cell_type": "markdown", - "id": "6a2eff34", - "metadata": {}, - "source": [ - "### Regex is case sensitive\n", - "\n", - "### Character classes\n", - "\n", - "- Character classes can be mentioned within `[...]`\n", - "- `^` means `NOT` of a character class\n", - "- `-` enables us to mention range of characters, for example `[A-Z]`\n", - "- `|` enables us to perform `OR`\n", - "\n", - "### Metacharacters\n", - "\n", - "- predefined character classes\n", - " - `\\d` => digits\n", - " - `\\s` => whitespace (space, tab, newline)\n", - " - `\\w` => \"word\" characters (digits, letters, underscores, etc) --- helpful for variable name matches and whole word matches (as it doesn't match whitespace --- `\\s`)\n", - " - `.` => wildcard: anything except newline\n", - "- capitalized version of character classes mean `NOT`, for example `\\D` => everything except digits\n", - "\n", - "### REPETITION\n", - "\n", - "- `<character>{<num matches>}` - for example: `w{3}`\n", - "- matches cannot overlap\n", - "\n", - "### Variable length repitition operators\n", - "\n", - "- `*` => 0 or more (greedy: match as many characters as possible)\n", - "- `+` => 1 or more (greedy: match as many characters as possible)\n", - "- `?` => 0 or 1\n", - 
"- `*?` => 0 or more (non-greedy: match as few characters as possible)\n", - "- `+?` => 1 or more (non-greedy: match as few characters as possible)\n", - "\n", - "#### Find everything inside of parentheses." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11f75d92-f215-47f4-8804-5e5727d3be55", - "metadata": {}, - "outputs": [], - "source": [ - "# this doesn't work\n", - "# it captures everything because () have special meaning (coming up)\n", - "reg(r\"\", s1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b488460e", - "metadata": {}, - "outputs": [], - "source": [ - "# How can we change this to not use special meaning of ()?\n", - "# * is greedy: match as many characters as possible\n", - "reg(r\"(.*)\", s1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "42155b54-d27b-4418-81a1-c005c508d738", - "metadata": {}, - "outputs": [], - "source": [ - "# non-greedy: stop at the first possible spot instead of the last possible spot\n", - "reg(r\"\\(.*\\)\", s1)" - ] - }, - { - "cell_type": "markdown", - "id": "0fd70cfd", - "metadata": {}, - "source": [ - "### Anchor characters\n", - "- `^` => start of string\n", - " - `^` is overloaded --- what was the other usage?\n", - "- `$` => end of string\n", - "\n", - "#### Find everything in the first sentence." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "40fed5db", - "metadata": {}, - "outputs": [], - "source": [ - "# doesn't work because remember regex finds all possible matches\n", - "# so it matches every single sentence \n", - "# (even though we are doing non-greedy match)\n", - "reg(r\"\", s1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e97abd8", - "metadata": {}, - "outputs": [], - "source": [ - "reg(r\".*?\\.\", s1)" - ] - }, - { - "cell_type": "markdown", - "id": "f66a4651", - "metadata": {}, - "source": [ - "#### Find everything in the first two sentences." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cfa8353e-4402-4f64-b3f8-35e73b2d7a68", - "metadata": {}, - "outputs": [], - "source": [ - "reg(r\"\", s1)" - ] - }, - { - "cell_type": "markdown", - "id": "76570acd", - "metadata": {}, - "source": [ - "#### Find last \"word\" in the sentence." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f35a6c81-47b5-48eb-8357-888fe832f85b", - "metadata": {}, - "outputs": [], - "source": [ - "reg(r\"\", s1)" - ] - }, - { - "cell_type": "markdown", - "id": "4b25fb66", - "metadata": {}, - "source": [ - "### Case study: find all phone numbers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ecbeaf0", - "metadata": {}, - "outputs": [], - "source": [ - "print(s2)\n", - "# The country code (1) in the front is optional\n", - "# The area code (608) is also optional\n", - "# Doesn't make sense to match country code without area code though!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88a01725-790c-4f21-b334-e3d31bed24b5", - "metadata": {}, - "outputs": [], - "source": [ - "# Full US phone numbers\n", - "reg(r\"\", s2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea8ce0be-98f8-4f9c-bcc0-1d8a0bb183a8", - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# The country code (1) in the front is optional\n", - "reg(r\"\", s2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5befb23a-848f-44d4-aaac-6b21ac0bbf43", - "metadata": {}, - "outputs": [], - "source": [ - "# The area code (608) is also optional\n", - "# Doesn't make sense to have country code without area code though!\n", - "reg(r\"\", s2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "34164fd1-a422-45a0-8e11-54199ca77120", - "metadata": {}, - "outputs": [], - "source": [ - "# This is good enough for 320 quizzes/tests\n", - "# But 
clearly, the last match is not correct\n", - "reg(r\"\", s2)" - ] - }, - { - "cell_type": "markdown", - "id": "8a2ee4e2", - "metadata": {}, - "source": [ - "Regex documentation link: https://docs.python.org/3/library/re.html." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "694a585b-b5a7-4a6f-a0f1-60521f7dfc47", - "metadata": {}, - "outputs": [], - "source": [ - "# BONUS: negative lookbehind (I won't test this)\n", - "reg(r\"(?<!\\d\\-)((\\d-)?\\d{3}-)?\\d{3}-\\d{4}\", s2)" - ] - }, - { - "cell_type": "markdown", - "id": "3973350b", - "metadata": {}, - "source": [ - "There is also a negative lookahead. For example, how to avoid matching \"1-608-123-456\" in \"1-608-123-4569999\". You can explore this if you are interested." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4988d765", - "metadata": {}, - "outputs": [], - "source": [ - "reg(r\"(?<!\\d\\-)((\\d-)?\\d{3}-)?\\d{3}-\\d{4}\", \"608-123-4569999\")" - ] - }, - { - "cell_type": "markdown", - "id": "b02ae9e0", - "metadata": {}, - "source": [ - "### Testing your regex\n", - "- you could use `reg(...)` function\n", - "- another useful resource: https://regex101.com/" - ] - }, - { - "cell_type": "markdown", - "id": "4a973271", - "metadata": {}, - "source": [ - "### `re` module\n", - "- `re.findall(<PATTERN>, <SEARCH STRING>)`: regular expression matches\n", - " - returns a list of strings \n", - "- `re.sub(<PATTERN>, <REPLACEMENT>, <SEARCH STRING>)`: regular expression match + substitution\n", - " - returns a new string with the substitutions (remember strings are immutable)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "73ec525f", - "metadata": {}, - "outputs": [], - "source": [ - "msg = \"In CS 320,\\tthere are 28 lectures, 11 quizzes, 3 exams,\\t6 projects, and 1000 things to learn. CS 320 is awesome!\"\n", - "print(msg)" - ] - }, - { - "cell_type": "markdown", - "id": "34998a5e", - "metadata": {}, - "source": [ - "#### Find all digits." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f42c25a", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "70b2f488", - "metadata": {}, - "source": [ - "### Groups\n", - "- we can capture matches using `()` => this is the special meaning of `()`\n", - "- returns a list of tuples, where length of the tuple will be number of groups\n", - "\n", - "#### Find all digits and the word that comes after that." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5309adee", - "metadata": {}, - "outputs": [], - "source": [ - "matches = re.findall(r\"\", msg)\n", - "matches" - ] - }, - { - "cell_type": "markdown", - "id": "bc6a982c", - "metadata": {}, - "source": [ - "#### Goal: make a dict (course component => count, like \"projects\" => 6)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7f1a028", - "metadata": {}, - "outputs": [], - "source": [ - "course_dict = {}\n", - "for count, component in matches:\n", - " course_dict[component] = int(count)\n", - "course_dict" - ] - }, - { - "cell_type": "markdown", - "id": "c4b6b505", - "metadata": {}, - "source": [ - "### Unlike matches, groups can overlap\n", - "\n", - "#### Find and group all digits and the word that comes after that." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "491c3460", - "metadata": {}, - "outputs": [], - "source": [ - "re.findall(r\"(\\d+) (\\w+)\", msg)" - ] - }, - { - "cell_type": "markdown", - "id": "d2227e69", - "metadata": {}, - "source": [ - "#### Substitute all digits with \"###\"." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d1fede1", - "metadata": {}, - "outputs": [], - "source": [ - "re.sub(r\"\", , msg)" - ] - }, - { - "cell_type": "markdown", - "id": "9d531122", - "metadata": {}, - "source": [ - "#### Goal: normalize whitespace (everything will be a single space)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4becbe70", - "metadata": {}, - "outputs": [], - "source": [ - "print(msg)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "72a6eb42", - "metadata": {}, - "outputs": [], - "source": [ - "re.sub(r\"\", , msg)" - ] - }, - { - "cell_type": "markdown", - "id": "6faf33fd", - "metadata": {}, - "source": [ - "### How to use groups is substitution?\n", - "- `\\g<N>` gives you the result of the N'th grouping.\n", - "\n", - "#### Substitute all course component counts with HTML bold tags." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8df577fd", - "metadata": {}, - "outputs": [], - "source": [ - "print(re.sub(r\"(\\d+)\", \"<b></b>\", msg))" - ] - }, - { - "cell_type": "markdown", - "id": "35a15a41", - "metadata": {}, - "source": [ - "In CS <b>320</b>, there are <b>28</b> lectures, <b>11</b> quizzes, <b>3</b> exams, <b>6</b> projects, and <b>1000</b> things to learn. CS <b>320</b> is awesome!" 
- ] - }, - { - "cell_type": "markdown", - "id": "6b299526", - "metadata": {}, - "source": [ - "### Git log example" - ] - }, - { - "cell_type": "markdown", - "id": "a9b7261c", - "metadata": {}, - "source": [ - "#### Run `git log` as a shell command" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10e459b4", - "metadata": {}, - "outputs": [], - "source": [ - "!git log" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ef440fed", - "metadata": {}, - "outputs": [], - "source": [ - "git_log_output = str(check_output([\"git\", \"log\"]), encoding=\"utf-8\")\n", - "print(git_log_output[:500])" - ] - }, - { - "cell_type": "markdown", - "id": "5c154b46", - "metadata": {}, - "source": [ - "#### GOAL: find all the commit numbers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ea954bf", - "metadata": {}, - "outputs": [], - "source": [ - "commits = re.findall(r\"\", git_log_output)\n", - "# recent 10 commit numbers\n", - "commits[:10]" - ] - }, - { - "cell_type": "markdown", - "id": "bc485b5f", - "metadata": {}, - "source": [ - "#### What days of the week does the team push things into this repo?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "57353c44", - "metadata": {}, - "outputs": [], - "source": [ - "print(git_log_output[:500])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d1ea6f59", - "metadata": {}, - "outputs": [], - "source": [ - "days = re.findall(r\"\", git_log_output)\n", - "days" - ] - }, - { - "cell_type": "markdown", - "id": "2c7efb55", - "metadata": {}, - "source": [ - "#### Count unique days" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3c2d7207", - "metadata": {}, - "outputs": [], - "source": [ - "day_counts = pd.Series(days).value_counts()\n", - "day_counts" - ] - }, - { - "cell_type": "markdown", - "id": "7317ca35", - "metadata": {}, - "source": [ - "#### Sort by day of the week" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d5c5c58c", - "metadata": {}, - "outputs": [], - "source": [ - "sorted_day_counts = day_counts.loc[[\"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sun\"]]\n", - "sorted_day_counts" - ] - }, - { - "cell_type": "markdown", - "id": "745afee5", - "metadata": {}, - "source": [ - "#### Create a bar plot" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7bb8f6f", - "metadata": {}, - "outputs": [], - "source": [ - "ax = sorted_day_counts.plot.bar()\n", - "ax.set_ylabel(\"Commit counts\")\n", - "ax.set_xlabel(\"Days of the week\")" - ] - }, - { - "cell_type": "markdown", - "id": "ecfc71e6", - "metadata": {}, - "source": [ - "#### Find all commit authors names." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6153035a", - "metadata": {}, - "outputs": [], - "source": [ - "authors = re.findall(r\"\", git_log_output)\n", - "authors[0]" - ] - }, - { - "cell_type": "markdown", - "id": "3fa201fb", - "metadata": {}, - "source": [ - "#### `git log` from projects repo" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e200a8b0", - "metadata": {}, - "outputs": [], - "source": [ - "git_log_output = str(check_output([\"git\", \"log\"], cwd=\"../../projects-and-labs\"), encoding=\"utf-8\")\n", - "print(git_log_output[:1000])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "053b2607", - "metadata": {}, - "outputs": [], - "source": [ - "re.findall(r\"\", git_log_output)" - ] - }, - { - "cell_type": "markdown", - "id": "3ce53c79", - "metadata": {}, - "source": [ - "### Emails example" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1968c0ff", - "metadata": {}, - "outputs": [], - "source": [ - "s = \"\"\"\n", - "Gurmail [Instructor] - gsingh58(AT) cs.wisc.edu\n", - "Jinlang [Head TA] - jwang2775 (AT) wisc.edu\n", - "Elliot [TA] - eepickens (AT) cs.wisc.edu\n", - "Alex [TA] - aclinton (AT) wisc.edu\n", - "Bowman [TA] - bnbrown3 (AT) wisc.edu\n", - "Hafeez [TA] - aneesali (AT) wisc.edu\n", - "William [TA] - wycong (AT) wisc.edu\n", - "\"\"\"\n", - "print(s)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5fbfdf12", - "metadata": {}, - "outputs": [], - "source": [ - "name = r\"\\w+\"\n", - "at = r\"@|([\\(\\[]?[Aa][Tt][\\)\\]]?)\"\n", - "domain = r\"\\w+\\.(\\w+\\.)?(edu|com|org|net|io|gov)\"\n", - "\n", - "full_regex = f\"(({name})\\s*({at})\\s*({domain}))\"\n", - "\n", - "re.findall(full_regex, s)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2257dbf1", - "metadata": {}, - "outputs": [], - "source": [ - "print(\"REGEX:\", full_regex)\n", - "for match in re.findall(full_regex, s):\n", - " 
print(match[1] + \"@\" + match[4])" - ] - }, - { - "cell_type": "markdown", - "id": "16c6c169", - "metadata": {}, - "source": [ - "### Self-practice\n", - "\n", - "Q1: Which regex will NOT match \"123\"\n", - "1. r\"\\d\\d\\d\"\n", - "2. r\"\\d{3}\"\n", - "3. r\"\\D\\D\\D\"\n", - "4. r\"...\"\n", - "\n", - "Q2: What will r\"^A\" match?\n", - "1. \"A\"\n", - "2. \"^A\"\n", - "3. \"BA\"\n", - "4. \"B\"\n", - "5. \"BB\"\n", - "\n", - "Q3: Which one can match \"HH\"?\n", - "1. r\"HA+H\"\n", - "2. r\"HA+?H\"\n", - "3. r\"H(A+)?H\"\n", - "\n", - "Q4: Which string(s) will match r\"^(ha)*$\"\n", - "1. \"\"\n", - "2. \"hahah\"\n", - "3. \"that\"\n", - "4. \"HAHA\"\n", - "\n", - "Q5: What is the type of the following?re.findall(r\"(\\d) (\\w+)\", some_str)[0]\n", - "1. list\n", - "2. tuple\n", - "3. string\n", - "\n", - "Q6: What will it do?\n", - "```python\n", - "re.sub(r\"(\\d{3})-(\\d{3}-\\d{4})\",\n", - " r\"(\\g<1>) \\g<2>\",\n", - " \"608-123-4567\")\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "f1184ba1", - "metadata": {}, - "source": [ - "The answers of these questions can be found in self_practice.ipynb. You may want to try to answer these questions yourself and then verify your answers." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}