From e6f81e83fe05a089574ff4c231f21eac135cdcea Mon Sep 17 00:00:00 2001
From: Felix Yan
Date: Sat, 5 Jun 2021 04:01:12 +0800
Subject: [PATCH] Make web-slang generation two-passes and reproducible

---
Notes (review commentary; everything between the `---` line above and the
first `diff --git` line is discarded by `git am`):

    What the patch does
      * `zhwiki-web-slang.py --fetch` prints the raw wikitext of the page;
        the Makefile stores it as $(WEB_SLANG_SOURCE).
      * `zhwiki-web-slang.py --process FILE` parses a stored copy and
        prints one word per line; the Makefile stores that as
        $(WEB_SLANG_FILE), which now depends on $(WEB_SLANG_SOURCE).
      * With no arguments the script fetches and processes in one run.
      * The web-slang files are versioned by the new WEB_SLANG_VERSION
        variable instead of VERSION.

    Possible follow-ups (deliberately not applied here, so the diff keeps
    matching the commit named in the first header line)
      * `--process` reads FILE through a bare open(): no explicit
        encoding is passed and the handle is never closed explicitly.
      * `--process` without FILE raises IndexError and an unrecognised
        flag raises NotImplementedError; neither prints a usage message.

    NOTE(review): the line structure of this file was rebuilt from a
    flattened copy. Blank context lines and the Makefile recipe tabs were
    inferred from the hunk counts and the diffstat; confirm them against
    the upstream commit before relying on a strict (no-fuzz) apply.

 Makefile            | 13 +++++--
 zhwiki-web-slang.py | 93 ++++++++++++++++++++++++++++-----------------
 2 files changed, 68 insertions(+), 38 deletions(-)

diff --git a/Makefile b/Makefile
index 586e505..ec96ae7 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,8 @@
-VERSION=20210101
+VERSION=20210601
+WEB_SLANG_VERSION=20210605
 FILENAME=zhwiki-$(VERSION)-all-titles-in-ns0
-WEB_SLANG_FILE=web-slang-$(VERSION).source
+WEB_SLANG_FILE=web-slang-$(WEB_SLANG_VERSION).txt
+WEB_SLANG_SOURCE=web-slang-$(WEB_SLANG_VERSION).source
 
 all: build
 
@@ -11,8 +13,11 @@ download: $(FILENAME).gz
 $(FILENAME).gz:
 	wget https://dumps.wikimedia.org/zhwiki/$(VERSION)/$(FILENAME).gz
 
-$(WEB_SLANG_FILE):
-	./zhwiki-web-slang.py > $(WEB_SLANG_FILE)
+$(WEB_SLANG_SOURCE):
+	./zhwiki-web-slang.py --fetch > $(WEB_SLANG_SOURCE)
+
+$(WEB_SLANG_FILE): $(WEB_SLANG_SOURCE)
+	./zhwiki-web-slang.py --process $(WEB_SLANG_SOURCE) > $(WEB_SLANG_FILE)
 
 $(FILENAME): $(FILENAME).gz
 	gzip -k -d $(FILENAME).gz
diff --git a/zhwiki-web-slang.py b/zhwiki-web-slang.py
index e8b069b..22604c5 100755
--- a/zhwiki-web-slang.py
+++ b/zhwiki-web-slang.py
@@ -5,46 +5,71 @@
 import urllib.parse
 import urllib.request
 import collections
+import sys
 
-_ZHWIKI_SOURCE_URL = "https://zh.wikipedia.org/w/api.php?action=parse&format=json&prop=wikitext&uselang=zh&formatversion=2&page="
-_PAGE = "中国大陆网络用语列表"
 
-page = urllib.request.urlopen(_ZHWIKI_SOURCE_URL + urllib.parse.quote(_PAGE)).read()
-wikitext = json.loads(page)["parse"]["wikitext"]
-words = collections.OrderedDict()
+def fetch():
+    _ZHWIKI_SOURCE_URL = "https://zh.wikipedia.org/w/api.php?action=parse&format=json&prop=wikitext&uselang=zh&formatversion=2&page="
+    _PAGE = "中国大陆网络用语列表"
 
+    page = urllib.request.urlopen(_ZHWIKI_SOURCE_URL + urllib.parse.quote(_PAGE)).read()
+    wikitext = json.loads(page)["parse"]["wikitext"]
+    return wikitext
 
-def add_word(word):
-    if word.startswith("形容"):
-        return
-    for garbage in ("、", "[", "]", "…"):
-        word = word.replace(garbage, "")
-    words[word.strip()] = None
 
+def process(wikitext):
+    words = collections.OrderedDict()
 
-def add_words(word):
-    for word_separator in ("、", "/", "|", ",", "。"):
-        if word_separator in word:
-            for w in word.split(word_separator):
-                # recursively resolve
-                add_words(w.strip())
-            break
-    else:
-        add_word(word)
-
+    def add_word(word):
+        if word.startswith("形容"):
+            return
+        for garbage in ("、", "[", "]", "…"):
+            word = word.replace(garbage, "")
+        words[word.strip()] = None
 
-for line in wikitext.split("\n"):
-    if line.startswith("*"):
-        # Lists
-        for table_separator in (":", ":"):
-            if table_separator in line:
-                word = line.split(table_separator)[0].strip("*").strip()
-                add_words(word)
+    def add_words(word):
+        for word_separator in ("、", "/", "|", ",", "。"):
+            if word_separator in word:
+                for w in word.split(word_separator):
+                    # recursively resolve
+                    add_words(w.strip())
                 break
-    elif line.startswith("|"):
-        # Tables
-        word = line.split("|")[1]
-        add_words(word)
+        else:
+            add_word(word)
+
+    for line in wikitext.split("\n"):
+        if line.startswith("*"):
+            # Lists
+            for table_separator in (":", ":"):
+                if table_separator in line:
+                    word = line.split(table_separator)[0].strip("*").strip()
+                    add_words(word)
+                    break
+        elif line.startswith("|"):
+            # Tables
+            word = line.split("|")[1]
+            add_words(word)
+
+    return words
+
+
+def print_words(words):
+    for word in words:
+        print(word)
 
-for word in words:
-    print(word)
+
+if __name__ == "__main__":
+    if len(sys.argv) == 1:
+        wikitext = fetch()
+        words = process(wikitext)
+        print_words(words)
+
+    elif sys.argv[1] == "--fetch":
+        print(fetch())
+
+    elif sys.argv[1] == "--process":
+        wikitext = open(sys.argv[2]).read()
+        print_words(process(wikitext))
+
+    else:
+        raise NotImplementedError