Skip to content

Commit

Permalink
更新雅俗通字典
Browse files Browse the repository at this point in the history
  • Loading branch information
AlanJui committed Mar 26, 2024
1 parent 5a0332f commit 98d0207
Show file tree
Hide file tree
Showing 11 changed files with 11,010 additions and 2 deletions.
Binary file added Nga_Siok_Thong_Ji_Tian.db
Binary file not shown.
3 changes: 2 additions & 1 deletion config_dev_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@
WAIT_TIME = 5 # seconds

# Database
DATABASE = '.\\Kong_Un.db'
# DATABASE = '.\\Kong_Un.db'
DATABASE = '.\\Nga_Siok_Thong_Ji_Tian.db'
Binary file modified docs/D110-01_雅俗通十五音字典轉換工具.xlsx
Binary file not shown.
Binary file modified docs/D110_彙集雅俗通十五音字典.xlsx
Binary file not shown.
3 changes: 2 additions & 1 deletion m300_查字典標注音.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import sys

import settings
from p100_tsa_ji_tian import main_run as tsa_ji_tian_tshue_tsu_im
# from p100_tsa_ji_tian import main_run as tsa_ji_tian_tshue_tsu_im
from p100_tsa_nga_siok_thong_ji_tian import main_run as tsa_ji_tian_tshue_tsu_im


def myfunc(argv):
Expand Down
Binary file modified output/Piau-Tsu-Im.xlsx
Binary file not shown.
196 changes: 196 additions & 0 deletions p100_tsa_nga_siok_thong_ji_tian.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
import re

import sqlite3
import xlwings as xw

# 專案全域常數
from config_dev_env import DATABASE

# Punctuation that is skipped instead of being looked up in the dictionary.
# The second class previously contained "\\u2018" (a literal backslash plus
# the letters "u2018"), which turned it into a range covering most ASCII
# letters and digits; it now really denotes the curly quotes U+2018..U+201D.
_PIAU_TIAM_1 = r"[,、:;.。?!()「」【】《》“]"
_PIAU_TIAM_2 = r"[\uFF0C\uFF08-\uFF09\u2013-\u2014\u2026\u2018-\u201D\u3000\u3001-\u303F\uFE50-\uFE5E]"  # noqa: E501
_PIAU_TIAM_RE = re.compile(f"{_PIAU_TIAM_1}|{_PIAU_TIAM_2}")

# Only the columns that are actually written to the workbook are selected, in
# the order in which they are unpacked below.  The character is bound through
# a placeholder so that a quote inside a cell cannot alter the statement.
_TSHUE_PIAU_IM_SQL = (
    "SELECT 識別號, 拼音碼, 聲, 韻, 調, 常用度 "
    "FROM 雅俗通字庫 "
    "WHERE 漢字 = ? "
    "ORDER BY 常用度 DESC;"
)


def _reset_or_add_sheet(wb, sheet_name):
    """Return worksheet *sheet_name* of *wb*, cleared if present, else newly added."""
    try:
        # The lookup itself is inside the ``try``: presumably it is the step
        # that raises when the sheet is missing (the exception type depends on
        # the Excel backend, hence the broad catch kept from the original).
        sheet = wb.sheets[sheet_name]
        sheet.select()
        sheet.clear()
    except Exception as e:
        print(e)
        sheet = wb.sheets.add(name=sheet_name)
    return sheet


def main_run(CONVERT_FILE_NAME):
    """Annotate every character in column A of the "漢字注音表" sheet.

    For each row the character in column A is looked up in the 雅俗通字庫
    table of the SQLite database named by ``DATABASE``.  The most frequently
    used reading is written to columns B-G of the same row (拼音碼, 聲, 韻, 調,
    number of readings found, 常用度).

    * Characters without any reading are listed in the "缺字表" sheet
      (running number, character, row number in "漢字注音表").
    * Characters with more than one reading get one line per reading in the
      "字庫表" sheet (row number, record id, character, 拼音碼, 聲, 韻, 調,
      常用度).
    * Empty cells and punctuation are skipped.

    Both auxiliary sheets are cleared (or created) before processing.  The
    workbook is neither saved nor closed here.

    Args:
        CONVERT_FILE_NAME: Path of the Excel workbook to open with xlwings.

    Returns:
        None.
    """
    wb = xw.Book(CONVERT_FILE_NAME)

    # Source sheet and the number of rows to process.
    source_sheet = wb.sheets["漢字注音表"]
    end_of_row_no = (
        source_sheet.range("A" + str(source_sheet.cells.last_cell.row)).end("up").row
    )
    # NOTE(review): the last used row of column A is excluded, as in the
    # original code -- presumably it holds an end marker; confirm.
    end_of_row_no = int(end_of_row_no) - 1
    print(f"end_row = {end_of_row_no}")

    # Auxiliary sheets: missing characters and characters with several readings.
    khiam_ji_piau = _reset_or_add_sheet(wb, "缺字表")
    ji_khoo_piau = _reset_or_add_sheet(wb, "字庫表")

    # Adding a sheet changes the active sheet, so re-select the working sheet.
    han_ji_tsu_im_piau = wb.sheets["漢字注音表"]
    han_ji_tsu_im_piau.select()

    conn = sqlite3.connect(DATABASE)
    try:
        db_cursor = conn.cursor()
        ji_khoo_index = 1   # next free row in "字庫表"
        khiam_ji_index = 1  # next free row in "缺字表"

        # Source and target are the same sheet and always advance together,
        # so a single row counter is sufficient.
        for row_no in range(1, end_of_row_no + 1):
            print(f"row = {row_no}")

            # Character whose reading is wanted.  An empty cell is returned
            # as None and must not become the text "None".
            cell_value = source_sheet.range("A" + str(row_no)).value
            beh_tshue_tsu_im_e_ji = (
                "" if cell_value is None else str(cell_value).strip()
            )

            # Blank cells (including cells holding only a line break, which
            # strip() reduces to "") and punctuation carry no reading.
            if not beh_tshue_tsu_im_e_ji:
                continue
            if _PIAU_TIAM_RE.search(beh_tshue_tsu_im_e_ji):
                continue

            # Look up all readings, most frequently used first.
            db_cursor.execute(_TSHUE_PIAU_IM_SQL, (beh_tshue_tsu_im_e_ji,))
            ji_e_piau_im = db_cursor.fetchall()

            # No reading found: record the character in "缺字表".
            if not ji_e_piau_im:
                print(f"Can not find 【{beh_tshue_tsu_im_e_ji}】in Han-Ji-Khoo!!")
                khiam_ji_piau.range("A" + str(khiam_ji_index)).value = [
                    khiam_ji_index,
                    beh_tshue_tsu_im_e_ji,
                    row_no,
                ]
                khiam_ji_index += 1
                continue

            # Write the preferred (first) reading into columns B-G.
            piau_im_tsong_soo = len(ji_e_piau_im)
            _, tsu_im, siann_bu, un_bu, tiau_ho, freq = ji_e_piau_im[0]
            han_ji_tsu_im_piau.range("B" + str(row_no)).value = [
                tsu_im,
                siann_bu,
                un_bu,
                tiau_ho,
                piau_im_tsong_soo,
                freq,
            ]

            # Several readings: list every one of them in "字庫表", using the
            # same columns as for the preferred reading above.
            if piau_im_tsong_soo > 1:
                for han_ji_id, tsu_im, siann_bu, un_bu, tiau_ho, freq in ji_e_piau_im:
                    ji_khoo_piau.range("A" + str(ji_khoo_index)).value = [
                        row_no,
                        han_ji_id,
                        beh_tshue_tsu_im_e_ji,
                        tsu_im,
                        siann_bu,
                        un_bu,
                        tiau_ho,
                        freq,
                    ]
                    ji_khoo_index += 1
    finally:
        # Close the database even when a lookup or an Excel write fails.
        conn.close()
85 changes: 85 additions & 0 deletions tools/SQLite/Nga-Siok-Thong/聲母對照表.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
識別號,韻母碼,國際音標,白話字,台羅,閩拼,方音,十五音,十五音序,舒促聲,十五音識別碼
1,un,un,un,un,un,ㄨㄣ,君,1,舒聲,君舒
2,ut,ut̚,ut,ut,ut,ㄨㆵ,君,1,促聲,君促
3,ian,ian,ian,ian,ian,ㄧㄢ,堅,2,舒聲,堅舒
4,iat,iat̚,iat,iat,iat,ㄧㄚㆵ,堅,2,促聲,堅促
5,im,im,im,im,im,ㄧㆬ,金,3,舒聲,金舒
6,ip,ip̚,ip,ip,ip,ㄧㆴ,金,3,促聲,金促
7,ui,ui,ui,ui,ui,ㄨㄧ,規,4,舒聲,規舒
8,ee,ɛ,e,ee,e,ㄝ,嘉,5,舒聲,嘉舒
9,eeh,ɛ?,eeh,eeh,,ㄝㆷ,嘉,5,促聲,嘉促
10,an,an,an,an,an,ㄢ,干,6,舒聲,干舒
11,at,at̚,at,at,at,ㄚㆵ,干,6,促聲,干促
12,ong,ɔŋ,ong,ong,ong,ㆲ,公,7,舒聲,公舒
13,ok,ɔk̚,ok,ok,ok,ㆦㆻ,公,7,促聲,公促
14,uai,uai,oai,uai,uai,ㄨㄞ,乖,8,舒聲,乖舒
15,uaih,uai?,oaih,uaih,uaih,ㄨㄞㆷ,乖,8,促聲,乖促
16,ing,iŋ,eng,ing,ing,ㄧㄥ,經,9,舒聲,經舒
17,ik,ik̚,ek,ik,ik,ㄧㆻ,經,9,促聲,經促
18,uan,uan,oan,uan,uan,ㄨㄢ,觀,10,舒聲,觀舒
19,uat,uat̚,oat,uat,uat,ㄨㄚㆵ,觀,10,促聲,觀促
20,oo,ɔu,o͘,oo,oo,ㆦ,沽,11,舒聲,沽舒
21,iau,iau,iau,iau,iao,ㄧㄠ,嬌,12,舒聲,嬌舒
22,iauh,iau?,iauh,iauh,iaoh,ㄧㄠㆷ,嬌,12,促聲,嬌促
23,ei,ei,e,e,e,ㆤ,稽,13,舒聲,稽舒
24,iong,iɔŋ,iong,iong,iong,ㄧㆲ,恭,14,舒聲,恭舒
25,iok,iɔk̚,iok,iok,iok,ㄧㆦㆻ,恭,14,促聲,恭促
26,o,ə,o,o,o,ㄜ,高,15,舒聲,高舒
27,oh,ə?,oh,oh,oh,ㄜㆷ,高,15,促聲,高促
28,ai,ai,ai,ai,ai,ㄞ,皆,16,舒聲,皆舒
29,in,in,in,in,in,ㄧㄣ,巾,17,舒聲,巾舒
30,it,it̚,it,it,it,ㄧㆵ,巾,17,促聲,巾促
31,iang,iaŋ,iang,iang,iang,ㄧㄤ,姜,18,舒聲,姜舒
32,iak,iak̚,iak,iak,iak,ㄧㄚㆻ,姜,18,促聲,姜促
33,am,am,am,am,am,ㆰ,甘,19,舒聲,甘舒
34,ap,ap̚,ap,ap,ap,ㄚㆴ,甘,19,促聲,甘促
35,ua,ua,oa,ua,ua,ㄨㄚ,瓜,20,舒聲,瓜舒
36,uah,ua?,oah,uah,uah,ㄨㄚㆷ,瓜,20,促聲,瓜促
37,ang,aŋ,ang,ang,ang,ㄤ,江,21,舒聲,江舒
38,ak,ak̚,ak,ak,ak,ㄚㆻ,江,21,促聲,江促
39,iam,iam,iam,iam,iam,ㄧㆰ,兼,22,舒聲,兼舒
40,iap,iap̚,iap,iap,iap,ㄧㄚㆴ,兼,22,促聲,兼促
41,au,au,au,au,ao,ㄠ,交,23,舒聲,交舒
42,auh,au?,auh,auh,aoh,ㄠㆷ,交,23,促聲,交促
43,ia,ia,ia,ia,ia,ㄧㄚ,迦,24,舒聲,迦舒
44,iah,ia?,iah,iah,iah,ㄧㄚㆷ,迦,24,促聲,迦促
45,ue,ue,oe,ue,ue,ㄨㆤ,檜,25,舒聲,檜舒
46,ueh,ue?,oeh,ueh,ueh,ㄨㆤㆷ,檜,25,促聲,檜促
47,ann,ã,aⁿ,ann,na,ㆩ,監,26,舒聲,監舒
48,ahnn,ã?,aⁿh,annh,nah,ㆩㆷ,監,26,促聲,監促
49,u,u,u,u,u,ㄨ,艍,27,舒聲,艍舒
50,uh,u?,uh,uh,uh,ㄨㆷ,艍,27,促聲,艍促
51,a,a,a,a,a,ㄚ,膠,28,舒聲,膠舒
52,ah,a?,ah,ah,ah,ㄚㆷ,膠,28,促聲,膠促
53,i,i,i,i,i,ㄧ,居,29,舒聲,居舒
54,ih,i?,ih,ih,ih,ㄧㆷ,居,29,促聲,居促
55,iu,iu,iu,iu,iu,ㄧㄨ,丩,30,舒聲,丩舒
56,enn,ẽ,eⁿ,enn,ne,ㆥ,更,31,舒聲,更舒
57,ehnn,ẽ?,eⁿh,ennh,neh,ㆥㆷ,更,31,促聲,更促
58,uinn,uĩ,uiⁿ,uinn,nui,ㄨㆪ,褌,32,舒聲,褌舒
59,io,iə,io,io,io,ㄧㄜ,茄,33,舒聲,茄舒
60,ioh,iə?,ioh,ioh,ioh,ㄧㄜㆷ,茄,33,促聲,茄促
61,inn,ĩ,iⁿ,inn,ni,ㆪ,梔,34,舒聲,梔舒
62,ihnn,ĩ?,iⁿh,innh,nih,ㆪㆷ,梔,34,促聲,梔促
63,ionn,ĩɔ̃,ioⁿ,ionn,nioo,ㄧㆧ,薑,35,舒聲,薑舒
64,iann,iã,iaⁿ,iann,nia,ㄧㆩ,驚,36,舒聲,驚舒
65,uann,ũã,oaⁿ,uann,nua,ㄨㆩ,官,37,舒聲,官舒
66,ng,ŋ̍,ng,ng,ng,ㆭ,鋼,38,舒聲,鋼舒
67,e,e,e,e,e,ㆤ,伽,39,舒聲,伽舒
68,eh,e?,eh,eh,eh,ㆤㆷ,伽,39,促聲,伽促
69,ainn,ãĩ,aiⁿ,ainn,nai,ㆮ,閒,40,舒聲,閒舒
70,oonn,ɔ̃ũ,oⁿ,onn,noo,ㆧ,姑,41,舒聲,姑舒
71,m,m̩,m,m,m,ㆬ,姆,42,舒聲,姆舒
72,uang,uaŋ,oang,uang,uang,ㄨㄤ,光,43,舒聲,光舒
73,uak,uak̚,oak,uak,uak,ㄨㄚㆻ,光,43,促聲,光促
74,uainn,uãĩ,oaiⁿ,uainn,nuai,ㄨㆮ,閂,44,舒聲,閂舒
75,uaihnn,uãĩ?,oaiⁿh,uainnh,nuaih,ㄨㆮㆷ,閂,44,促聲,閂促
76,uenn,uẽ,oeⁿ,uenn,nue,ㄨㆥ,糜,45,舒聲,糜舒
77,iaunn,ĩãũ,iauⁿ,iaunn,niao,ㄧㆯ,嘄,46,舒聲,嘄舒
78,iauhnn,ĩãũ?,iauⁿh,iaunnh,niaoh,ㄧㆯㆷ,嘄,46,促聲,嘄促
79,om,ɔm,om,om,om,ㆱ,箴,47,舒聲,箴舒
80,op,ɔp̚,op,op,op,ㆦㆴ,箴,47,促聲,箴促
81,aunn,ãũ,auⁿ,aunn,nao,ㆯ,爻,48,舒聲,爻舒
82,onn,õ,oⁿ,onn,noo,ㆧ,扛,49,舒聲,扛舒
83,ohnn,õh,oⁿh,onnh,nooh,ㆧㆷ,扛,49,促聲,扛促
84,iunn,iũ,iuⁿ,iunn,niu,ㄧㆫ,牛,50,舒聲,牛舒
Binary file not shown.
Loading

0 comments on commit 98d0207

Please sign in to comment.