-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_dict.py
37 lines (31 loc) · 1.07 KB
/
preprocess_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import json
import re
# File that is read when the user answers the prompt with an empty line.
default_filename = "term_meta_bank_1.json"

# input() returns "" on a bare Enter; "" is falsy, so `or` selects the default.
filename = (
    input(f"Enter the filename (default: {default_filename}): ")
    or default_filename
)

# Collected output rows; filled in by the processing loop further down.
items = []
def remove_non_kanji(text: str) -> str:
    """Return only the kanji characters of *text*, in their original order.

    A character is kept when it lies in the CJK Unified Ideographs block
    (U+4E00 through U+9FFF, inclusive). Every other character -- kana,
    Latin letters, digits, punctuation, whitespace -- is dropped.

    Args:
        text: The string to filter.

    Returns:
        A string made of the kept characters; empty when *text* contains
        no character in that range.
    """
    # The range test alone rejects ASCII letters and digits, so a single
    # pass over the characters is sufficient.
    return "".join(c for c in text if "\u4e00" <= c <= "\u9fff")
# Load the input file chosen at the prompt.
# NOTE(review): the loop below assumes each row is a sequence whose element 0
# is the term and whose element 2 is a dict that may carry "reading" and
# "frequency" -> {"value": ...}; confirm against the input file format.
with open(filename, mode="r", encoding="utf-8") as f:
    data = json.load(f)

for row in data:
    term_data = row[2]
    # Rows without a reading are skipped entirely.
    if term_data.get("reading") is None:
        continue
    term = row[0]
    reading = term_data["reading"]
    kanji_string = remove_non_kanji(term)
    freq = term_data["frequency"]["value"]
    # Keep only terms that contain at least one kanji.
    if kanji_string:
        items.append([term, kanji_string, reading, freq])

print("Output:")
# Preview at most the first 20 rows. Slicing (rather than indexing 0..19)
# keeps the script from raising IndexError when fewer than 20 rows qualified.
for item in items[:20]:
    print(item)
print(f"Total terms: {len(data)}")
print(f"Total terms with kanji: {len(items)}")

# Write compact JSON (no spaces after separators) and keep non-ASCII
# characters as-is instead of \u escapes.
with open("dict.json", mode="w", encoding="utf-8") as f:
    json.dump(items, f, ensure_ascii=False, separators=(',', ':'))
print("Done.")