Commit
mn + archive scripts
Khemarato Bhikkhu committed Jun 22, 2023
1 parent 7e4d140 commit cc76b30
Showing 7 changed files with 160 additions and 21 deletions.
21 changes: 21 additions & 0 deletions .github/workflows/archive.yml
@@ -0,0 +1,21 @@
name: Archive.org Saver
on:
  workflow_dispatch:
  schedule:
    - cron: "40 3 15 5,11 *"
jobs:
  Archive:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout the Code
        uses: actions/checkout@v3
        with:
          ref: main
      - name: Install Dependencies
        run: |
          cd ~
          printf "${{ secrets.ARCHIVE_ORG_AUTH }}" > archive.org.auth
          pip install tqdm
      - name: Run the Site Archiver
        run: |
          python scripts/archive_site.py
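For orientation, a minimal sketch of how the secret written by the Install Dependencies step is later consumed by archive_site.py (shown further down): the script reads ~/archive.org.auth and sends its contents as the Authorization header. The "LOW <accesskey>:<secret>" format is an assumption, not something this diff shows.

# Sketch only: the handoff from the workflow's secret file to the archiver's headers.
# Assumption: archive.org.auth holds a ready-made header value like "LOW <key>:<secret>".
import os
from pathlib import Path

auth_value = Path(os.path.expanduser("~/archive.org.auth")).read_text().strip()
headers = {"Authorization": auth_value, "Accept": "application/json"}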
12 changes: 6 additions & 6 deletions _content/av/caring-for-glaciers_gagne.md
@@ -1,5 +1,5 @@
---
title: "Caring for Glaciers"
title: "Caring for the Land in Ladakh"
authors:
- "Karine Gagné"
drive_links:
@@ -12,14 +12,14 @@ course: nature
year: 2019
month: nov
tags:
- present
- agriculture
- pastoralism
- landscape
- sustainability
- inner-asia
- himalayas
- climate-change
minutes: 6
---

In this short clip, Karine Gagné discusses the changing landscape of Ladakh, shaped largely by military exchanges in the region and by climate fluctuations, and how these changes have made it difficult for the local farmers and herders to thrive.
> Whatever the hardship, one should never abandon their land or their animals.
On how the modernization of the Kashmiri economy is experienced as a _moral_ disruption by the agropastoralists of Ladakh.
19 changes: 19 additions & 0 deletions _content/canon/mn126.md
@@ -0,0 +1,19 @@
---
title: "MN 126 Bhūmija Sutta: With Bhūmija"
translator: sujato
slug: "mn126"
external_url: "https://suttacentral.net/mn126/en/sujato"
drive_links:
- "https://drive.google.com/file/d/1-wtKLyWIOCRvOPFYMJ2L4VT_NK-Fnsvv/view?usp=drivesdk"
course: thought
tags:
- imagery
- path
- mn
year: 2018
pages: 4
---

> heaping sand in a bucket, sprinkling it thoroughly with water, and pressing it out. But by doing this, they couldn’t extract any oil, regardless of whether they made a wish
It's not wishing for *nibbāna* that leads there, but rather putting in the intelligent effort required to walk the path.
4 changes: 4 additions & 0 deletions _data/drive_folders.json
@@ -1,4 +1,8 @@
{
"": {
"private": null,
"public": "https://drive.google.com/drive/folders/1Ih3PRUKLHaWzVvoVVkCRuaCzbsjreQXa"
},
"aging": {
"private": "https://drive.google.com/drive/folders/18aHM6fThA1zeLMunS7MIsa6UbjM-6hPI",
"public": "https://drive.google.com/drive/folders/1d9FVRx81URWYJSqcFHc1NTROsvVhIn__"
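A small illustrative read of the new root ("") entry added above; this assumes the complete _data/drive_folders.json is valid JSON (only a fragment appears in this diff).

# Illustrative only: look up the newly added site-wide ("") Drive folder entry.
import json
from pathlib import Path

folders = json.loads(Path("_data/drive_folders.json").read_text())
print(folders[""]["public"])      # the new public root folder link
print(folders["aging"]["public"]) # per-topic entries keep the same shape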
21 changes: 21 additions & 0 deletions scripts/archivable_urls/extracturls.py
@@ -0,0 +1,21 @@
import re

# download the latest lychee output from GitHub
input_file = "lycheeout.txt"
output_file = "urls.txt"

# Regular expression pattern to match the desired URLs
pattern = r"✔ \[200\] (https?://\S+)"

# Open the input and output files
with open(input_file, "r") as f_in, open(output_file, "w") as f_out:
    # Read each line from the input file
    for line in f_in:
        # Find the URLs matching the pattern
        match = re.search(pattern, line)
        if match:
            url = match.group(1)
            # Write the URL to the output file
            f_out.write(url + "\n")

print("URLs extracted and saved to 'urls.txt'.")
34 changes: 34 additions & 0 deletions scripts/archivable_urls/filterurls.py
@@ -0,0 +1,34 @@
import re

input_file = "urls.txt"
output_file = "filteredurls.txt"

# Regular expression patterns
exclude_pattern = r"https?://(web\.)?archive\.org"
include_pattern = r"(https?://(?!.*archive\.org)\S*?(\.html?|\.mp3|\.pdf)|https?://\S*?/download\S*)"

# Set to store unique URLs
unique_urls = set()

# Open the input file
with open(input_file, "r") as f_in:
    # Read each line from the input file
    for line in f_in:
        # Exclude URLs matching the exclude pattern
        if re.search(exclude_pattern, line):
            continue

        # Find the URLs matching the include pattern
        match = re.search(include_pattern, line)
        if match:
            url = match.group(0)
            # Add the URL to the set
            unique_urls.add(url)

# Open the output file
with open(output_file, "w") as f_out:
    # Write the unique URLs to the output file
    for url in unique_urls:
        f_out.write(url + "\n")

print("Filtered URLs (with duplicates removed) extracted and saved to 'filteredurls.txt'.")
70 changes: 55 additions & 15 deletions scripts/archive_site.py
@@ -1,11 +1,19 @@
"""Saves every page across the site to Archive.org's Wayback Machine"""

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from pathlib import Path
import json
import time
from datetime import datetime, timedelta
import os
import xml.etree.ElementTree as XML
try:
    from tqdm import tqdm, trange
except:
    print(" pip install tqdm")
    quit(1)
ARCHIVE_ORG_AUTH_FILE = '~/archive.org.auth'

ARCHIVE_ORG_AUTH_PATH = Path(os.path.expanduser(ARCHIVE_ORG_AUTH_FILE))
@@ -17,6 +25,14 @@

SITEMAP_NAMESPACE = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"}

retry_strategy = Retry(total=2, backoff_factor=0.5)
http_adapter = HTTPAdapter(max_retries=retry_strategy)
archive_org_session = requests.Session()
archive_org_session.mount("https://", http_adapter)
archive_org_session.timeout = 5
archive_org_session.headers['Authorization'] = ARCHIVE_ORG_AUTH
archive_org_session.headers['Accept'] = 'application/json'

def all_urls_in_website(domain):
    # Fetch the XML sitemap using requests
    response = requests.get(domain+"/sitemap.xml")
@@ -28,17 +44,25 @@ def all_urls_in_website(domain):
    # Find all URL elements using XPath
    url_elements = root.findall(".//ns:url", SITEMAP_NAMESPACE)
    togo = len(url_elements)
    print(f"Found {togo} urls. Will take at least {(togo/600.0):.2f} hours")
    # Extract and yield the URLs
    for url_element in url_elements:
        loc_element = url_element.find("ns:loc", SITEMAP_NAMESPACE)
        if loc_element is not None:
            yield loc_element.text

def last_archived_datetime(url):
    resp = archive_org_session.head("https://web.archive.org/web/"+str(url))
    if not resp.ok:
        return None
    if not 'x-archive-redirect-reason' in resp.headers:
        return None
    timestamp = resp.headers['x-archive-redirect-reason'].split(' at ')[1]
    return datetime.strptime(timestamp, '%Y%m%d%H%M%S')

def save_url_to_archiveorg(url):
    print(f"Saving {url} to the Wayback Machine now...")
    try:
        resp = requests.post("https://web.archive.org/save", data={"url": url}, headers={'Accept': 'application/json', 'Authorization': ARCHIVE_ORG_AUTH})
        resp = archive_org_session.post("https://web.archive.org/save", data={"url": url})
    except:
        print("WARNING: A connection error occurred")
        return False
@@ -49,35 +73,51 @@ def save_url_to_archiveorg(url):
print(f"WARNING: Save failed\n\t{resp.headers}\n\tCONTENT:\n\t{resp.text}")
return False

if __name__ == "__main__":
try:
from tqdm import tqdm, trange
except:
print(" pip install tqdm")
quit(1)
skip_past = "Last successful URL"
urls = list(all_urls_in_website("https://buddhistuniversity.net"))
def archive_urls(urls, skip_urls_archived_in_last_days=365):
successes = 0
def wait_secs(n):
print(f"Waiting {n} seconds...")
for i in trange(n):
time.sleep(1)
try:
skip_past = urls.index(skip_past)
urls = urls[skip_past+1:]
except:
pass
if skip_urls_archived_in_last_days:
now = datetime.now()
skipinterval = timedelta(days=skip_urls_archived_in_last_days)
def should_arch(url):
archtime = last_archived_datetime(url)
if not archtime:
return True
return now-archtime > skipinterval
else:
def should_arch(url):
return True
consecutive_failures = 0
for url in tqdm(urls):
if not should_arch(url):
print(f"Skipping {url}...")
continue
if not save_url_to_archiveorg(url):
consecutive_failures += 1
wait_secs(60)
if save_url_to_archiveorg(url):
successes += 1
consecutive_failures = 0
else:
consecutive_failures += 1
else:
successes += 1
consecutive_failures = 0
if consecutive_failures > 5:
print("ERROR: This doesn't seem to be working...")
quit(1)
wait_secs(5)
return successes

if __name__ == "__main__":
skip_past = "Last successful URL"
urls = list(all_urls_in_website("https://buddhistuniversity.net"))
try:
skip_past = urls.index(skip_past)
urls = urls[skip_past+1:]
except:
pass
archive_urls(urls)
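A hedged usage sketch of the refactored entry point: the function names and the skip_urls_archived_in_last_days parameter come from the diff above, but importing the script as a module and the 30-day window are illustrative choices, not part of the commit.

# Illustrative driver (assumes scripts/archive_site.py is importable as archive_site).
from archive_site import all_urls_in_website, archive_urls

urls = list(all_urls_in_website("https://buddhistuniversity.net"))
saved = archive_urls(urls, skip_urls_archived_in_last_days=30)  # 30 is illustrative; the default is 365
print(f"Archived {saved} pages")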
