From 83c721e0007204d6175da913208e5b1324472323 Mon Sep 17 00:00:00 2001
From: Felix Yan
Date: Tue, 26 May 2020 12:09:54 +0800
Subject: [PATCH] Attempt to filter out words with long common suffix that's
 already in the dict (#6)

---
 convert.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/convert.py b/convert.py
index 20c867f..ad505ab 100755
--- a/convert.py
+++ b/convert.py
@@ -3,12 +3,13 @@
 import re
 import opencc
 from pypinyin import lazy_pinyin
-converter = opencc.OpenCC('t2s.json')
 
 FILE = sys.argv[1]
+converter = opencc.OpenCC('t2s.json')
 HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$')
 
 count = 0
+last_word = None
 with open(FILE) as f:
     for line in f:
         line = line.rstrip("\n")
@@ -23,11 +24,16 @@
         if line.endswith('\u5217\u8868'):
             continue
 
+        if last_word and len(last_word) >= 4 and line.startswith(last_word):
+            continue
+
         pinyin = "'".join(lazy_pinyin(line))
         if pinyin == line:
             print("Failed to convert, ignoring:", pinyin, file=sys.stderr)
             continue
 
+        last_word = line
+
         print("\t".join((converter.convert(line), pinyin, "0")))
         count += 1
         if count % 1000 == 0:
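
For reference, a minimal standalone sketch of the rule this hunk adds: a candidate word is skipped when the previously kept word is at least four characters long and is a prefix of the candidate, which relies on the input list being ordered so that a base word appears immediately before its extensions. The pinyin-conversion step from convert.py is omitted here, and the sample words are hypothetical.

```python
def filter_redundant(words):
    """Drop entries that merely extend the previously kept word (>= 4 chars)."""
    kept = []
    last_word = None
    for word in words:
        # Same condition as the patch: long previous word that prefixes this one.
        if last_word and len(last_word) >= 4 and word.startswith(last_word):
            continue
        last_word = word
        kept.append(word)
    return kept


if __name__ == "__main__":
    sample = ["中华人民共和国", "中华人民共和国国务院", "中华人民共和国宪法", "中文维基百科"]
    # Only the first and last entries survive; the two extensions of the
    # seven-character base word are dropped.
    print(filter_redundant(sample))
```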