Skip to content

Commit

Permalink
Attempt to filter out words with long common suffix that's already in the dict (#6)
Browse files (browse the repository at this point in the history)
  • Loading branch information
felixonmars committed May 26, 2020
1 parent 1ccfd21 commit 83c721e
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
import re
import opencc
from pypinyin import lazy_pinyin
converter = opencc.OpenCC('t2s.json')

FILE = sys.argv[1]

converter = opencc.OpenCC('t2s.json')
HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$')
count = 0
last_word = None
with open(FILE) as f:
for line in f:
line = line.rstrip("\n")
Expand All @@ -23,11 +24,16 @@
if line.endswith('\u5217\u8868'):
continue

if last_word and len(last_word) >= 4 and line.startswith(last_word):
continue

pinyin = "'".join(lazy_pinyin(line))
if pinyin == line:
print("Failed to convert, ignoring:", pinyin, file=sys.stderr)
continue

last_word = line

print("\t".join((converter.convert(line), pinyin, "0")))
count += 1
if count % 1000 == 0:
Expand Down

0 comments on commit 83c721e

Please sign in to comment.