From 83c721e0007204d6175da913208e5b1324472323 Mon Sep 17 00:00:00 2001
From: Felix Yan
Date: Tue, 26 May 2020 12:09:54 +0800
Subject: [PATCH] Attempt to filter out words with long common suffix that's
 already in the dict (#6)

---
 convert.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/convert.py b/convert.py
index 20c867f..ad505ab 100755
--- a/convert.py
+++ b/convert.py
@@ -3,12 +3,13 @@
 import re
 import opencc
 from pypinyin import lazy_pinyin
-converter = opencc.OpenCC('t2s.json')
 
 FILE = sys.argv[1]
+converter = opencc.OpenCC('t2s.json')
 HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$')
 
 count = 0
+last_word = None
 with open(FILE) as f:
     for line in f:
         line = line.rstrip("\n")
@@ -23,11 +24,16 @@
         if line.endswith('\u5217\u8868'):
             continue
 
+        if last_word and len(last_word) >= 4 and line.startswith(last_word):
+            continue
+
         pinyin = "'".join(lazy_pinyin(line))
         if pinyin == line:
             print("Failed to convert, ignoring:", pinyin, file=sys.stderr)
             continue
 
+        last_word = line
+
         print("\t".join((converter.convert(line), pinyin, "0")))
         count += 1
         if count % 1000 == 0:
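
For reference, a minimal standalone sketch of the rule this hunk adds: a candidate word is skipped when the previously kept word is at least four characters long and is a prefix of the candidate, which relies on the input list being ordered so that a base word appears immediately before its extensions. The pinyin-conversion step from convert.py is omitted here, and the sample words are hypothetical.

```python
def filter_redundant(words):
    """Drop entries that merely extend the previously kept word (>= 4 chars)."""
    kept = []
    last_word = None
    for word in words:
        # Same condition as the patch: long previous word that prefixes this one.
        if last_word and len(last_word) >= 4 and word.startswith(last_word):
            continue
        last_word = word
        kept.append(word)
    return kept


if __name__ == "__main__":
    sample = ["中华人民共和国", "中华人民共和国国务院", "中华人民共和国宪法", "中文维基百科"]
    # Only the first and last entries survive; the two extensions of the
    # seven-character base word are dropped.
    print(filter_redundant(sample))
```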