Commit
mn + archive scripts
Khemarato Bhikkhu committed Jun 22, 2023
1 parent 7e4d140 commit cc76b30
Showing 7 changed files with 160 additions and 21 deletions.
21 changes: 21 additions & 0 deletions .github/workflows/archive.yml
@@ -0,0 +1,21 @@
name: Archive.org Saver
on:
  workflow_dispatch:
  schedule:
    - cron: "40 3 15 5,11 *"
jobs:
  Archive:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout the Code
        uses: actions/checkout@v3
        with:
          ref: main
      - name: Install Dependencies
        run: |
          cd ~
          printf "${{ secrets.ARCHIVE_ORG_AUTH }}" > archive.org.auth
          pip install tqdm
      - name: Run the Site Archiver
        run: |
          python scripts/archive_site.py
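For orientation, a minimal sketch of how the secret written by the Install Dependencies step is later consumed by archive_site.py (shown further down): the script reads ~/archive.org.auth and sends its contents as the Authorization header. The "LOW <accesskey>:<secret>" format is an assumption, not something this diff shows.

# Sketch only: the handoff from the workflow's secret file to the archiver's headers.
# Assumption: archive.org.auth holds a ready-made header value like "LOW <key>:<secret>".
import os
from pathlib import Path

auth_value = Path(os.path.expanduser("~/archive.org.auth")).read_text().strip()
headers = {"Authorization": auth_value, "Accept": "application/json"}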
12 changes: 6 additions & 6 deletions _content/av/caring-for-glaciers_gagne.md
@@ -1,5 +1,5 @@
---
title: "Caring for Glaciers"
title: "Caring for the Land in Ladakh"
authors:
- "Karine Gagné"
drive_links:
@@ -12,14 +12,14 @@ course: nature
year: 2019
month: nov
tags:
- present
- agriculture
- pastoralism
- landscape
- sustainability
- inner-asia
- himalayas
- climate-change
minutes: 6
---

In this short clip, Karine Gagné discusses the changing landscape of Ladakh, shaped largely by military exchanges in the region and by climate fluctuations, and how these changes have made it difficult for the local farmers and herders to thrive.
> Whatever the hardship, one should never abandon their land or their animals.
On how the modernization of the Kashmiri economy is experienced as a _moral_ disruption by the agropastoralists of Ladakh.
19 changes: 19 additions & 0 deletions _content/canon/mn126.md
@@ -0,0 +1,19 @@
---
title: "MN 126 Bhūmija Sutta: With Bhūmija"
translator: sujato
slug: "mn126"
external_url: "https://suttacentral.net/mn126/en/sujato"
drive_links:
- "https://drive.google.com/file/d/1-wtKLyWIOCRvOPFYMJ2L4VT_NK-Fnsvv/view?usp=drivesdk"
course: thought
tags:
- imagery
- path
- mn
year: 2018
pages: 4
---

> heaping sand in a bucket, sprinkling it thoroughly with water, and pressing it out. But by doing this, they couldn’t extract any oil, regardless of whether they made a wish
It's not wishing for *nibbāna* that leads there, but rather putting in the intelligent effort required to walk the path.
4 changes: 4 additions & 0 deletions _data/drive_folders.json
@@ -1,4 +1,8 @@
{
"": {
"private": null,
"public": "https://drive.google.com/drive/folders/1Ih3PRUKLHaWzVvoVVkCRuaCzbsjreQXa"
},
"aging": {
"private": "https://drive.google.com/drive/folders/18aHM6fThA1zeLMunS7MIsa6UbjM-6hPI",
"public": "https://drive.google.com/drive/folders/1d9FVRx81URWYJSqcFHc1NTROsvVhIn__"
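A small illustrative read of the new root ("") entry added above; this assumes the complete _data/drive_folders.json is valid JSON (only a fragment appears in this diff).

# Illustrative only: look up the newly added site-wide ("") Drive folder entry.
import json
from pathlib import Path

folders = json.loads(Path("_data/drive_folders.json").read_text())
print(folders[""]["public"])      # the new public root folder link
print(folders["aging"]["public"]) # per-topic entries keep the same shape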
21 changes: 21 additions & 0 deletions scripts/archivable_urls/extracturls.py
@@ -0,0 +1,21 @@
import re

# download the latest lychee output from GitHub
input_file = "lycheeout.txt"
output_file = "urls.txt"

# Regular expression pattern to match the desired URLs
pattern = r"✔ \[200\] (https?://\S+)"

# Open the input and output files
with open(input_file, "r") as f_in, open(output_file, "w") as f_out:
    # Read each line from the input file
    for line in f_in:
        # Find the URLs matching the pattern
        match = re.search(pattern, line)
        if match:
            url = match.group(1)
            # Write the URL to the output file
            f_out.write(url + "\n")

print("URLs extracted and saved to 'urls.txt'.")
34 changes: 34 additions & 0 deletions scripts/archivable_urls/filterurls.py
@@ -0,0 +1,34 @@
import re

input_file = "urls.txt"
output_file = "filteredurls.txt"

# Regular expression patterns
exclude_pattern = r"https?://(web\.)?archive\.org"
include_pattern = r"(https?://(?!.*archive\.org)\S*?(\.html?|\.mp3|\.pdf)|https?://\S*?/download\S*)"

# Set to store unique URLs
unique_urls = set()

# Open the input file
with open(input_file, "r") as f_in:
    # Read each line from the input file
    for line in f_in:
        # Exclude URLs matching the exclude pattern
        if re.search(exclude_pattern, line):
            continue

        # Find the URLs matching the include pattern
        match = re.search(include_pattern, line)
        if match:
            url = match.group(0)
            # Add the URL to the set
            unique_urls.add(url)

# Open the output file
with open(output_file, "w") as f_out:
    # Write the unique URLs to the output file
    for url in unique_urls:
        f_out.write(url + "\n")

print("Filtered URLs (with duplicates removed) extracted and saved to 'filteredurls.txt'.")
70 changes: 55 additions & 15 deletions scripts/archive_site.py
@@ -1,11 +1,19 @@
"""Saves every page across the site to Archive.org's Wayback Machine"""

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from pathlib import Path
import json
import time
from datetime import datetime, timedelta
import os
import xml.etree.ElementTree as XML
try:
    from tqdm import tqdm, trange
except:
    print(" pip install tqdm")
    quit(1)
ARCHIVE_ORG_AUTH_FILE = '~/archive.org.auth'

ARCHIVE_ORG_AUTH_PATH = Path(os.path.expanduser(ARCHIVE_ORG_AUTH_FILE))
@@ -17,6 +25,14 @@

SITEMAP_NAMESPACE = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"}

retry_strategy = Retry(total=2, backoff_factor=0.5)
http_adapter = HTTPAdapter(max_retries=retry_strategy)
archive_org_session = requests.Session()
archive_org_session.mount("https://", http_adapter)
archive_org_session.timeout = 5
archive_org_session.headers['Authorization'] = ARCHIVE_ORG_AUTH
archive_org_session.headers['Accept'] = 'application/json'

def all_urls_in_website(domain):
    # Fetch the XML sitemap using requests
    response = requests.get(domain+"/sitemap.xml")
@@ -28,17 +44,25 @@ def all_urls_in_website(domain):
    # Find all URL elements using XPath
    url_elements = root.findall(".//ns:url", SITEMAP_NAMESPACE)
    togo = len(url_elements)
    print(f"Found {togo} urls. Will take at least {(togo/600.0):.2f} hours")
    # Extract and yield the URLs
    for url_element in url_elements:
        loc_element = url_element.find("ns:loc", SITEMAP_NAMESPACE)
        if loc_element is not None:
            yield loc_element.text

def last_archived_datetime(url):
    resp = archive_org_session.head("https://web.archive.org/web/"+str(url))
    if not resp.ok:
        return None
    if not 'x-archive-redirect-reason' in resp.headers:
        return None
    timestamp = resp.headers['x-archive-redirect-reason'].split(' at ')[1]
    return datetime.strptime(timestamp, '%Y%m%d%H%M%S')

def save_url_to_archiveorg(url):
    print(f"Saving {url} to the Wayback Machine now...")
    try:
        resp = requests.post("https://web.archive.org/save", data={"url": url}, headers={'Accept': 'application/json', 'Authorization': ARCHIVE_ORG_AUTH})
        resp = archive_org_session.post("https://web.archive.org/save", data={"url": url})
    except:
        print("WARNING: A connection error occurred")
        return False
@@ -49,35 +73,51 @@ def save_url_to_archiveorg(url):
print(f"WARNING: Save failed\n\t{resp.headers}\n\tCONTENT:\n\t{resp.text}")
return False

if __name__ == "__main__":
try:
from tqdm import tqdm, trange
except:
print(" pip install tqdm")
quit(1)
skip_past = "Last successful URL"
urls = list(all_urls_in_website("https://buddhistuniversity.net"))
def archive_urls(urls, skip_urls_archived_in_last_days=365):
successes = 0
def wait_secs(n):
print(f"Waiting {n} seconds...")
for i in trange(n):
time.sleep(1)
try:
skip_past = urls.index(skip_past)
urls = urls[skip_past+1:]
except:
pass
if skip_urls_archived_in_last_days:
now = datetime.now()
skipinterval = timedelta(days=skip_urls_archived_in_last_days)
def should_arch(url):
archtime = last_archived_datetime(url)
if not archtime:
return True
return now-archtime > skipinterval
else:
def should_arch(url):
return True
consecutive_failures = 0
for url in tqdm(urls):
if not should_arch(url):
print(f"Skipping {url}...")
continue
if not save_url_to_archiveorg(url):
consecutive_failures += 1
wait_secs(60)
if save_url_to_archiveorg(url):
successes += 1
consecutive_failures = 0
else:
consecutive_failures += 1
else:
successes += 1
consecutive_failures = 0
if consecutive_failures > 5:
print("ERROR: This doesn't seem to be working...")
quit(1)
wait_secs(5)
return successes

if __name__ == "__main__":
skip_past = "Last successful URL"
urls = list(all_urls_in_website("https://buddhistuniversity.net"))
try:
skip_past = urls.index(skip_past)
urls = urls[skip_past+1:]
except:
pass
archive_urls(urls)
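A hedged usage sketch of the refactored entry point: the function names and the skip_urls_archived_in_last_days parameter come from the diff above, but importing the script as a module and the 30-day window are illustrative choices, not part of the commit.

# Illustrative driver (assumes scripts/archive_site.py is importable as archive_site).
from archive_site import all_urls_in_website, archive_urls

urls = list(all_urls_in_website("https://buddhistuniversity.net"))
saved = archive_urls(urls, skip_urls_archived_in_last_days=30)  # 30 is illustrative; the default is 365
print(f"Archived {saved} pages")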
