From e6f81e83fe05a089574ff4c231f21eac135cdcea Mon Sep 17 00:00:00 2001
From: Felix Yan <felixonmars@archlinux.org>
Date: Sat, 5 Jun 2021 04:01:12 +0800
Subject: [PATCH] Make web-slang generation two-pass and reproducible

---
 Makefile            | 13 +++++--
 zhwiki-web-slang.py | 93 ++++++++++++++++++++++++++++-----------------
 2 files changed, 68 insertions(+), 38 deletions(-)

diff --git a/Makefile b/Makefile
index 586e505..ec96ae7 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,8 @@
-VERSION=20210101
+VERSION=20210601
+WEB_SLANG_VERSION=20210605
 FILENAME=zhwiki-$(VERSION)-all-titles-in-ns0
-WEB_SLANG_FILE=web-slang-$(VERSION).source
+WEB_SLANG_FILE=web-slang-$(WEB_SLANG_VERSION).txt
+WEB_SLANG_SOURCE=web-slang-$(WEB_SLANG_VERSION).source
 
 all: build
 
@@ -11,8 +13,11 @@ download: $(FILENAME).gz
 $(FILENAME).gz:
 	wget https://dumps.wikimedia.org/zhwiki/$(VERSION)/$(FILENAME).gz
 
-$(WEB_SLANG_FILE):
-	./zhwiki-web-slang.py > $(WEB_SLANG_FILE)
+$(WEB_SLANG_SOURCE):
+	./zhwiki-web-slang.py --fetch > $@.tmp && mv $@.tmp $@
+# Targets are written via a temp file so a failed run never leaves a bogus one.
+$(WEB_SLANG_FILE): $(WEB_SLANG_SOURCE)
+	./zhwiki-web-slang.py --process $< > $@.tmp && mv $@.tmp $@
 
 $(FILENAME): $(FILENAME).gz
 	gzip -k -d $(FILENAME).gz
diff --git a/zhwiki-web-slang.py b/zhwiki-web-slang.py
index e8b069b..22604c5 100755
--- a/zhwiki-web-slang.py
+++ b/zhwiki-web-slang.py
@@ -5,46 +5,71 @@
 import urllib.parse
 import urllib.request
 import collections
+import sys
 
-_ZHWIKI_SOURCE_URL = "https://zh.wikipedia.org/w/api.php?action=parse&format=json&prop=wikitext&uselang=zh&formatversion=2&page="
-_PAGE = "中国大陆网络用语列表"
 
-page = urllib.request.urlopen(_ZHWIKI_SOURCE_URL + urllib.parse.quote(_PAGE)).read()
-wikitext = json.loads(page)["parse"]["wikitext"]
-words = collections.OrderedDict()
+def fetch():
+    """Download the list page from zhwiki and return its raw wikitext."""
+    api_url = "https://zh.wikipedia.org/w/api.php?action=parse&format=json&prop=wikitext&uselang=zh&formatversion=2&page="
 
+    page_url = api_url + urllib.parse.quote("中国大陆网络用语列表")
+    with urllib.request.urlopen(page_url) as response:  # closes the connection
+        return json.loads(response.read())["parse"]["wikitext"]
 
-def add_word(word):
-    if word.startswith("形容"):
-        return
-    for garbage in ("、", "[", "]", "…"):
-        word = word.replace(garbage, "")
-    words[word.strip()] = None
 
+def process(wikitext):
+    words = collections.OrderedDict()
 
-def add_words(word):
-    for word_separator in ("、", "/", "|", "，", "。"):
-        if word_separator in word:
-            for w in word.split(word_separator):
-                # recursively resolve
-                add_words(w.strip())
-            break
-    else:
-        add_word(word)
-
+    def add_word(word):
+        """Record one cleaned term; skip "形容…" descriptions and blanks."""
+        cleaned = word.translate(str.maketrans("", "", "、[]…")).strip()
+        # A fragment that is empty after cleanup must not become a blank entry.
+        if cleaned and not word.startswith("形容"):
+            words[cleaned] = None
 
-for line in wikitext.split("\n"):
-    if line.startswith("*"):
-        # Lists
-        for table_separator in ("：", ":"):
-            if table_separator in line:
-                word = line.split(table_separator)[0].strip("*").strip()
-                add_words(word)
+    def add_words(word):
+        for word_separator in ("、", "/", "|", "，", "。"):
+            if word_separator in word:
+                for w in word.split(word_separator):
+                    # each piece may hold further separators, so recurse on it
+                    add_words(w.strip())
                 break
-    elif line.startswith("|"):
-        # Tables
-        word = line.split("|")[1]
-        add_words(word)
+        else:
+            add_word(word)
+
+    for line in wikitext.split("\n"):
+        if line.startswith("*"):
+            # List item: keep what precedes the colon (full-width tried first)
+            for table_separator in ("：", ":"):
+                if table_separator in line:
+                    word = line.split(table_separator)[0].strip("*").strip()
+                    add_words(word)
+                    break
+        elif line.startswith("|"):
+            # Table line: keep the text after the leading "|" up to the next one
+            word = line.split("|")[1]
+            add_words(word)
+
+    return words
+
+
+def print_words(words):
+    """Write every word to standard output, one per line, in iteration order."""
+    sys.stdout.writelines(f"{entry}\n" for entry in words)
 
-for word in words:
-    print(word)
+
+if __name__ == "__main__":
+    # Pin UTF-8 so the output does not depend on the caller's locale.
+    sys.stdout.reconfigure(encoding="utf-8")
+    if len(sys.argv) == 1:
+        # No arguments: fetch and process in a single pass.
+        print_words(process(fetch()))
+    elif sys.argv[1] == "--fetch":
+        # Dump the raw wikitext so it can be stored and processed later.
+        print(fetch())
+    elif sys.argv[1] == "--process":
+        # Read the stored wikitext as UTF-8 and close the file afterwards.
+        with open(sys.argv[2], encoding="utf-8") as source_file:
+            print_words(process(source_file.read()))
+    else:
+        raise NotImplementedError