From 4371defb8006c9513da9a1da058a591ca8676c40 Mon Sep 17 00:00:00 2001
From: 100stacks <100stacks@users.noreply.github.com>
Date: Sat, 8 Jun 2024 17:56:16 -0500
Subject: [PATCH 1/5] app: simple cli web scraper

---
 .gitignore    |  3 +++
 app/scrape.py | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+)
 create mode 100644 app/scrape.py

diff --git a/.gitignore b/.gitignore
index 82f9275..afd770b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,9 @@ __pycache__/
 *.py[cod]
 *$py.class
 
+# OS files
+*/*.DS_Store
+
 # C extensions
 *.so
 
diff --git a/app/scrape.py b/app/scrape.py
new file mode 100644
index 0000000..e6cb59f
--- /dev/null
+++ b/app/scrape.py
@@ -0,0 +1,18 @@
+import re
+import sys
+import urllib.request
+
+def get_links(url):
+    response = urllib.request.urlopen(url)
+    html = response.read().decode("utf-8")
+    links = []
+
+    for match in re.finditer('href="(.*?)"', html):
+        links.append(match.group(1))
+
+    return links
+
+if __name__ == "__main__":
+    links = get_links(sys.argv[1])
+
+    print(links)

From 8dced2f334af0da5a9eb9d798fcc583a46bcb178 Mon Sep 17 00:00:00 2001
From: 100stacks <100stacks@users.noreply.github.com>
Date: Sat, 8 Jun 2024 18:30:28 -0500
Subject: [PATCH 2/5] modal: implement initial modal config | test

---
 app/scrape.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/app/scrape.py b/app/scrape.py
index e6cb59f..fe7de18 100644
--- a/app/scrape.py
+++ b/app/scrape.py
@@ -2,6 +2,11 @@
 import sys
 import urllib.request
 
+import modal  # host code on Modal serverless platform
+
+app = modal.App(name="weblink-scraper")
+
+@app.function()
 def get_links(url):
     response = urllib.request.urlopen(url)
     html = response.read().decode("utf-8")
@@ -12,7 +17,8 @@ def get_links(url):
 
     return links
 
-if __name__ == "__main__":
-    links = get_links(sys.argv[1])
+@app.local_entrypoint()
+def main(url):
+    links = get_links.remote(url)
 
     print(links)
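
Usage note on PATCH 2/5: `@app.local_entrypoint()` makes `modal run` expose the
entrypoint's parameters as CLI flags, so a sketch of the invocation (assuming
the `modal` CLI is installed and authenticated) is:

    $ modal run app/scrape.py --url https://example.com

`get_links.remote(url)` executes the function in a Modal container; swapping in
`get_links.local(url)` would run the same code in-process, which is handy for
debugging before anything ships to the cloud.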
From 983133f0b21d10442dbf20f341bebe0fde374f33 Mon Sep 17 00:00:00 2001
From: 100stacks <100stacks@users.noreply.github.com>
Date: Sat, 8 Jun 2024 19:07:30 -0500
Subject: [PATCH 3/5] modal: add Playwright custom container to handle dynamic content

---
 app/scrape.py | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/app/scrape.py b/app/scrape.py
index fe7de18..2d89ddc 100644
--- a/app/scrape.py
+++ b/app/scrape.py
@@ -6,14 +6,39 @@
 
 app = modal.App(name="weblink-scraper")
 
-@app.function()
-def get_links(url):
-    response = urllib.request.urlopen(url)
-    html = response.read().decode("utf-8")
-    links = []
-
-    for match in re.finditer('href="(.*?)"', html):
-        links.append(match.group(1))
+"""
+    Custom containers
+
+    Playwright - launches a headless Chromium browser that interprets any JS on a webpage.
+    modal.Image - Modal's pre-bundled base image; here we extend `modal.Image.debian_slim`.
+"""
+playwright_image = modal.Image.debian_slim(python_version="3.10").run_commands(
+    "apt-get update",
+    "apt-get install -y software-properties-common",
+    "apt-add-repository non-free",
+    "apt-add-repository contrib",
+    "pip install playwright==1.30.0",
+    "playwright install-deps chromium",
+    "playwright install chromium",
+)
+
+@app.function(image=playwright_image)
+async def get_links(cur_url: str):
+    from playwright.async_api import async_playwright
+
+    async with async_playwright() as p:
+        browser = await p.chromium.launch()
+        page = await browser.new_page()
+
+        # scrape links on webpage
+        await page.goto(cur_url)
+        links = await page.eval_on_selector_all("a[href]", "elements => elements.map(element => element.href)")
+
+        # close session
+        await browser.close()
+
+    print("Links", links)
 
     return links
 

From 6edc778fb64b9c5b4b7169fde613c9ab22d8faa2 Mon Sep 17 00:00:00 2001
From: 100stacks <100stacks@users.noreply.github.com>
Date: Sat, 8 Jun 2024 19:45:39 -0500
Subject: [PATCH 4/5] modal: use map to scale out and scrape sites in parallel

---
 app/scrape.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/app/scrape.py b/app/scrape.py
index 2d89ddc..12792b4 100644
--- a/app/scrape.py
+++ b/app/scrape.py
@@ -42,8 +42,15 @@ async def get_links(cur_url: str):
 
     return links
 
+"""
+    Scaling out
+
+    Update our script to scrape a large list of URLs in parallel.
+"""
 @app.local_entrypoint()
-def main(url):
-    links = get_links.remote(url)
+def main():
+    urls = ["https://modal.com", "https://github.com"]
 
-    print(links)
+    for links in get_links.map(urls):
+        for link in links:
+            print(link)

From 57e1ad41b251cfe7b2f468eb35d869ddfb2ad54b Mon Sep 17 00:00:00 2001
From: 100stacks <100stacks@users.noreply.github.com>
Date: Sat, 8 Jun 2024 21:01:17 -0500
Subject: [PATCH 5/5] modal: deploy Modal app | scrape links every 5 days

---
 app/scrape.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/app/scrape.py b/app/scrape.py
index 12792b4..4d8bdd7 100644
--- a/app/scrape.py
+++ b/app/scrape.py
@@ -42,6 +42,21 @@ async def get_links(cur_url: str):
 
     return links
 
+"""
+    Deploy App: Schedule App Deployments
+
+    Simulate a list of websites to crawl. In a more realistic architecture,
+    this would be a dynamically generated list.
+"""
+@app.function(schedule=modal.Period(days=5))
+def daily_scrape():
+    urls = ["https://modal.com", "https://github.com", "https://www.ai.engineer/worldsfair/2024/schedule"]
+
+    for links in get_links.map(urls):
+        for link in links:
+            print(link)
+
+
 """
     Scaling out
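
Usage note on PATCH 3/5: the Playwright image is built once, cached, and
attached only to `get_links`; the local entrypoint never needs Chromium
installed. A minimal sketch of an alternative image definition, assuming a
Modal version where `Image.pip_install` is available, might look like:

    playwright_image = (
        modal.Image.debian_slim(python_version="3.10")
        .pip_install("playwright==1.30.0")       # Python package, replaces the raw `pip install`
        .run_commands(
            "playwright install-deps chromium",  # system dependencies for headless Chromium
            "playwright install chromium",       # the browser binary itself
        )
    )

The `apt-add-repository non-free`/`contrib` steps from the patch may still be
required on some Debian bases, so treat this as a starting point rather than a
drop-in replacement.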
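
Usage note on PATCH 4/5: `get_links.map(urls)` fans the calls out across
parallel containers and, by default, yields one result list per input URL in
input order. A short sketch of flattening the results inside `main`, assuming
the names from the patch:

    # each `links` is the list that get_links returned for one URL
    all_links = [link for links in get_links.map(urls) for link in links]
    print(f"scraped {len(all_links)} links from {len(urls)} sites")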
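
Usage note on PATCH 5/5: a schedule only takes effect once the app is deployed,
e.g. (assuming the Modal CLI):

    $ modal deploy app/scrape.py

`modal.Period(days=5)` then re-runs `daily_scrape` every five days (despite the
function's name) counted from deployment; `modal.Cron(...)` is the alternative
when the job should fire at specific wall-clock times.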