From 4916116d28fb958009e8d446533ce6635eedc4de Mon Sep 17 00:00:00 2001
From: Weisi Dai <weisi@x-research.com>
Date: Mon, 25 May 2020 23:07:19 -0700
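Subject: [PATCH 1/2] Minor refactoring.

Restructure convert.py into small functions with a main() entry point and
named module-level constants.

Note that this is not a pure refactoring:
- Titles are now converted to Simplified Chinese before filtering and
  Pinyin generation; previously only the printed word was converted.
- Input lines are stripped of all surrounding whitespace, not just the
  trailing newline.
- Progress and failure messages are emitted through logging instead of
  being printed to stderr directly.
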
---
 convert.py | 96 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 65 insertions(+), 31 deletions(-)

diff --git a/convert.py b/convert.py
index 4dd9a60..3abf660 100755
--- a/convert.py
+++ b/convert.py
@@ -1,44 +1,78 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-import sys
+# Usage:
+#   convert.py input_filename
+# input_filename is a file of Wikipedia article titles, one title per line.
+
+import logging
 import re
+import sys
+
 import opencc
 from pypinyin import lazy_pinyin
 
-FILE = sys.argv[1]
+# Require at least 2 characters
+_MINIMUM_LEN = 2
+_LIST_PAGE_ENDINGS = [
+    '列表',
+    '对照表',
+]
+_LOG_EVERY = 1000
+
+_PINYIN_SEPARATOR = '\''
+_HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$')
+_TO_SIMPLIFIED_CHINESE = opencc.OpenCC('t2s.json')
+
+logging.basicConfig(level=logging.INFO)
+
+
+def is_good_title(title, previous_title=None):
+    if not _HANZI_RE.match(title):
+        return False
+
+    # Skip titles shorter than _MINIMUM_LEN characters
+    if len(title) < _MINIMUM_LEN:
+        return False
+
+    # Skip list pages
+    if title.endswith(tuple(_LIST_PAGE_ENDINGS)):
+        return False
+
+    if previous_title and \
+      len(previous_title) >= 4 and \
+      title.startswith(previous_title):
+        return False
+
+    return True
 
-converter = opencc.OpenCC('t2s.json')
-HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$')
-count = 0
-last_word = None
-with open(FILE) as f:
-    for line in f:
-        line = line.rstrip("\n")
-        if not HANZI_RE.match(line):
-            continue
 
-        # Skip single character & too long pages
-        if not 1 < len(line):
-            continue
+def log_count(count):
+    if count % _LOG_EVERY == 0:
+        logging.info(f'{count} words generated')
 
-        # Skip list pages
-        if line.endswith(('列表', '对照表')):
-            continue
 
-        if last_word and len(last_word) >= 4 and line.startswith(last_word):
-            continue
+def make_output(word, pinyin):
+    return '\t'.join([word, pinyin, '0'])
 
-        pinyin = "'".join(lazy_pinyin(line))
-        if pinyin == line:
-            print("Failed to convert, ignoring:", pinyin, file=sys.stderr)
-            continue
 
-        last_word = line
+def main():
+    previous_title = None
+    result_count = 0
+    with open(sys.argv[1]) as f:
+        for line in f:
+            title = _TO_SIMPLIFIED_CHINESE.convert(line.strip())
+            if is_good_title(title, previous_title):
+                pinyin = _PINYIN_SEPARATOR.join(lazy_pinyin(title))
+                if pinyin == title:
+                    logging.info(
+                        f'Failed to convert to Pinyin. Ignoring: {pinyin}')
+                    continue
+                print(make_output(title, pinyin))
+                result_count += 1
+                log_count(result_count)
+                previous_title = title
+    log_count(result_count)
 
-        print("\t".join((converter.convert(line), pinyin, "0")))
-        count += 1
-        if count % 1000 == 0:
-            print(str(count) + " converted", file=sys.stderr)
 
-if count % 1000 != 0:
-    print(str(count) + " converted", file=sys.stderr)
+if __name__ == '__main__':
+    main()

From b7a8bd518a6e072c57cf2c832b58b17544af9cc4 Mon Sep 17 00:00:00 2001
From: Weisi Dai <weisi@x-research.com>
Date: Mon, 25 May 2020 23:52:42 -0700
Subject: [PATCH 2/2] Fix log count handling.

---
 convert.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/convert.py b/convert.py
index 3abf660..4d2b198 100755
--- a/convert.py
+++ b/convert.py
@@ -47,8 +47,7 @@ def is_good_title(title, previous_title=None):
 
 
 def log_count(count):
-    if count % _LOG_EVERY == 0:
-        logging.info(f'{count} words generated')
+    logging.info(f'{count} words generated')
 
 
 def make_output(word, pinyin):
@@ -69,7 +68,8 @@ def main():
                     continue
                 print(make_output(title, pinyin))
                 result_count += 1
-                log_count(result_count)
+                if result_count % _LOG_EVERY == 0:
+                    log_count(result_count)
                 previous_title = title
     log_count(result_count)