Merge pull request #7 from weisi/master

Minor refactoring.
felixonmars · May 26, 2020 · 2a41f14 · 2a41f14
2 parents 732e063 + b7a8bd5
commit 2a41f14
Showing 1 changed file with 65 additions and 31 deletions.
diff --git a/convert.py b/convert.py
@@ -1,44 +1,78 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-import sys
+# Usage:
+#   convert.py input_filename
+# input_filename is a file of Wikipedia article titles, one title per line.
+
+import logging
 import re
+import sys
+
 import opencc
 from pypinyin import lazy_pinyin
 
-FILE = sys.argv[1]
+# Require at least 2 characters
+_MINIMUM_LEN = 2
+_LIST_PAGE_ENDINGS = [
+    '列表',
+    '对照表',
+]
+_LOG_EVERY = 1000
+
+_PINYIN_SEPARATOR = '\''
+_HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$')
+_TO_SIMPLIFIED_CHINESE = opencc.OpenCC('t2s.json')
+
+logging.basicConfig(level=logging.INFO)
+
+
+def is_good_title(title, previous_title=None):
+    if not _HANZI_RE.match(title):
+        return False
+
+    # Skip single character & too long pages
+    if len(title) < _MINIMUM_LEN:
+        return False
+
+    # Skip list pages
+    if title.endswith(tuple(_LIST_PAGE_ENDINGS)):
+        return False
+
+    if previous_title and \
+      len(previous_title) >= 4 and \
+      title.startswith(previous_title):
+        return False
+
+    return True
 
-converter = opencc.OpenCC('t2s.json')
-HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$')
-count = 0
-last_word = None
-with open(FILE) as f:
-    for line in f:
-        line = line.rstrip("\n")
-        if not HANZI_RE.match(line):
-            continue
 
-        # Skip single character & too long pages
-        if not 1 < len(line):
-            continue
+def log_count(count):
+    logging.info(f'{count} words generated')
 
-        # Skip list pages
-        if line.endswith(('列表', '对照表')):
-            continue
 
-        if last_word and len(last_word) >= 4 and line.startswith(last_word):
-            continue
+def make_output(word, pinyin):
+    return '\t'.join([word, pinyin, '0'])
 
-        pinyin = "'".join(lazy_pinyin(line))
-        if pinyin == line:
-            print("Failed to convert, ignoring:", pinyin, file=sys.stderr)
-            continue
 
-        last_word = line
+def main():
+    previous_title = None
+    result_count = 0
+    with open(sys.argv[1]) as f:
+        for line in f:
+            title = _TO_SIMPLIFIED_CHINESE.convert(line.strip())
+            if is_good_title(title, previous_title):
+                pinyin = _PINYIN_SEPARATOR.join(lazy_pinyin(title))
+                if pinyin == title:
+                    logging.info(
+                        f'Failed to convert to Pinyin. Ignoring: {pinyin}')
+                    continue
+                print(make_output(title, pinyin))
+                result_count += 1
+                if result_count % _LOG_EVERY == 0:
+                    log_count(result_count)
+                previous_title = title
+    log_count(result_count)
 
-        print("\t".join((converter.convert(line), pinyin, "0")))
-        count += 1
-        if count % 1000 == 0:
-            print(str(count) + " converted", file=sys.stderr)
 
-if count % 1000 != 0:
-    print(str(count) + " converted", file=sys.stderr)
+if __name__ == '__main__':
+    main()