Skip to content

Commit

Permalink
Merge pull request #7 from weisi/master
Browse files Browse the repository at this point in the history
Minor refactoring.
  • Loading branch information
felixonmars authored May 26, 2020
2 parents 732e063 + b7a8bd5 commit 2a41f14
Showing 1 changed file with 65 additions and 31 deletions.
96 changes: 65 additions & 31 deletions convert.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,78 @@
#!/usr/bin/python3
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
# Usage:
# convert.py input_filename
# input_filename is a file of Wikipedia article titles, one title per line.

import logging
import re
import sys

import opencc
from pypinyin import lazy_pinyin

FILE = sys.argv[1]
# Require at least 2 characters: is_good_title() rejects any title shorter
# than this.
_MINIMUM_LEN = 2
# Title suffixes that mark list-style pages ('列表' = "list", '对照表' =
# "comparison table"); is_good_title() rejects titles ending in any of them.
_LIST_PAGE_ENDINGS = [
    '列表',
    '对照表',
]
# main() logs a progress message each time this many words have been output.
_LOG_EVERY = 1000

# Placed between the items returned by lazy_pinyin() when main() joins them.
_PINYIN_SEPARATOR = '\''
# Matches text made up solely of characters in the range U+4E00-U+9FA5
# (CJK ideographs); anything else, e.g. Latin letters, digits or
# punctuation, makes the match fail.
_HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$')
# OpenCC converter built from the 't2s.json' configuration; main() runs every
# input line through it.  NOTE(review): per the name and the OpenCC
# configuration list this is Traditional -> Simplified Chinese -- confirm.
_TO_SIMPLIFIED_CHINESE = opencc.OpenCC('t2s.json')

# Let INFO-level messages (progress counts, skipped titles) through the root
# logger.
logging.basicConfig(level=logging.INFO)


def is_good_title(title, previous_title=None):
    """Decide whether a page title should be emitted as a dictionary word.

    A title is rejected when any of the following holds:

    * it is not made up entirely of characters accepted by ``_HANZI_RE``;
    * it is shorter than ``_MINIMUM_LEN`` characters;
    * it ends with one of ``_LIST_PAGE_ENDINGS`` (list-style pages);
    * ``previous_title`` is at least four characters long and is a prefix
      of ``title``.

    Args:
        title: Candidate title string.
        previous_title: Title to compare against for the prefix rule, or
            ``None`` (or an empty string) to skip that rule.

    Returns:
        ``True`` if the title passes every filter, ``False`` otherwise.
    """
    # fullmatch() rather than match(): a pattern ending in ``$`` also matches
    # just before a trailing newline, so match() would accept a title that
    # still carries its line break and corrupt the tab-separated output.
    if not _HANZI_RE.fullmatch(title):
        return False

    # Skip titles that are too short (e.g. a single character).
    if len(title) < _MINIMUM_LEN:
        return False

    # Skip list pages.
    if title.endswith(tuple(_LIST_PAGE_ENDINGS)):
        return False

    # Skip titles that merely extend a sufficiently long previous title.
    if (previous_title
            and len(previous_title) >= 4
            and title.startswith(previous_title)):
        return False

    return True

converter = opencc.OpenCC('t2s.json')
HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$')
count = 0
last_word = None
with open(FILE) as f:
for line in f:
line = line.rstrip("\n")
if not HANZI_RE.match(line):
continue

# Skip single character & too long pages
if not 1 < len(line):
continue
def log_count(count):
    """Emit an INFO-level log message reporting ``count`` generated words.

    Args:
        count: Number of words generated so far.
    """
    message = f'{count} words generated'
    logging.info(message)

# Skip list pages
if line.endswith(('列表', '对照表')):
continue

if last_word and len(last_word) >= 4 and line.startswith(last_word):
continue
def make_output(word, pinyin):
    """Build one tab-separated output record.

    Args:
        word: The word (title) to emit.
        pinyin: The pinyin string for ``word``.

    Returns:
        ``word``, ``pinyin`` and the literal ``'0'`` joined by tab characters.
    """
    fields = (word, pinyin, '0')
    return '\t'.join(fields)

pinyin = "'".join(lazy_pinyin(line))
if pinyin == line:
print("Failed to convert, ignoring:", pinyin, file=sys.stderr)
continue

last_word = line
def main():
    """Convert the title list named on the command line into output records.

    Reads the file given as ``sys.argv[1]`` line by line, converts each
    stripped line with ``_TO_SIMPLIFIED_CHINESE``, filters it through
    ``is_good_title`` and prints one ``make_output`` record per accepted
    title.  Progress is reported through ``log_count`` every ``_LOG_EVERY``
    records, plus a final total.

    Raises:
        SystemExit: If no input filename was given on the command line.
    """
    if len(sys.argv) < 2:
        sys.exit('Usage: convert.py input_filename')

    previous_title = None
    result_count = 0
    # Decode explicitly as UTF-8 instead of the platform-default encoding,
    # which cannot be relied on to handle Chinese text on every system.
    # NOTE(review): assumes the title list is UTF-8 encoded -- confirm.
    with open(sys.argv[1], encoding='utf-8') as f:
        for line in f:
            title = _TO_SIMPLIFIED_CHINESE.convert(line.strip())
            if not is_good_title(title, previous_title):
                continue

            pinyin = _PINYIN_SEPARATOR.join(lazy_pinyin(title))
            if pinyin == title:
                # The joined result is identical to the input, i.e. nothing
                # was transliterated; drop the title.
                logging.info(
                    f'Failed to convert to Pinyin. Ignoring: {pinyin}')
                continue

            print(make_output(title, pinyin))
            result_count += 1
            if result_count % _LOG_EVERY == 0:
                log_count(result_count)
            # Only titles that were actually emitted become the reference
            # for the prefix filter applied to the following titles.
            previous_title = title

    # Report the final total, unless the periodic log inside the loop has
    # just reported exactly this number.
    if result_count == 0 or result_count % _LOG_EVERY != 0:
        log_count(result_count)

print("\t".join((converter.convert(line), pinyin, "0")))
count += 1
if count % 1000 == 0:
print(str(count) + " converted", file=sys.stderr)

if count % 1000 != 0:
print(str(count) + " converted", file=sys.stderr)
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()

0 comments on commit 2a41f14

Please sign in to comment.