Skip to content

Commit

Permalink
加入選擇【白話音】、【文讀音】查音自動化處理功能。
Browse files Browse the repository at this point in the history
  • Loading branch information
AlanJui committed Oct 12, 2024
1 parent 9e6c9c4 commit a817e2f
Show file tree
Hide file tree
Showing 14 changed files with 536 additions and 22 deletions.
40 changes: 40 additions & 0 deletions Documents/Sip-Ngoo-Im/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -311,3 +311,43 @@ UPDATE Han_Ji_Tian
SET 台語音標拼音 = SUBSTR(台語音標拼音, 2)
WHERE 台語音標拼音 LIKE 'q%';
```

## 台羅音標漢字庫

### 資料表結構(Schema)

```sql
-- Dictionary table: each row pairs a Han character (漢字) with a
-- Tai-lo romanization (台羅音標) plus bookkeeping columns.
CREATE TABLE 台羅音標漢字庫 (
-- 識別號: row id; declared as the AUTOINCREMENT primary key below.
識別號 INTEGER NOT NULL
UNIQUE,
-- 漢字: the Han character being looked up.
漢字 TEXT,
-- 台羅音標: Tai-lo romanization of the character.
台羅音標 TEXT,
-- 常用度: commonness score used to rank readings.
-- NOTE(review): the column has TEXT affinity; queries comparing it with
-- numeric literals (e.g. 常用度 >= 0.61) are evaluated as text
-- comparisons under SQLite's affinity rules -- confirm this is intended.
常用度 TEXT,
-- 摘要說明: free-form summary / notes.
摘要說明 TEXT,
-- 建立時間: creation timestamp; defaults to the local time at insert.
建立時間 TEXT DEFAULT (DATETIME('now', 'localtime') )
NOT NULL,
-- 更新時間: last-update timestamp; defaults to the local time at insert.
更新時間 TEXT NOT NULL
DEFAULT (DATETIME('now', 'localtime') ),
PRIMARY KEY (
識別號 AUTOINCREMENT
)
);
```


### 資料更新觸發器

```sql
-- Recreate the trigger from scratch so this script can be re-run safely.
DROP TRIGGER IF EXISTS 紀錄更新觸發器;

-- After any row update on 台羅音標漢字庫, stamp 更新時間 with the current
-- local time for the row that was updated.
CREATE TRIGGER 紀錄更新觸發器
AFTER UPDATE ON 台羅音標漢字庫
FOR EACH ROW
-- Only fire when the UPDATE left 更新時間 unchanged; an UPDATE that sets
-- a different 更新時間 explicitly is left as written.
WHEN NEW.更新時間 = OLD.更新時間
BEGIN
UPDATE 台羅音標漢字庫
SET 更新時間 = DATETIME('now', 'localtime')
-- Restrict the write to the row that triggered this invocation.
WHERE 識別號 = NEW.識別號;
END;
```

Binary file modified Nga_Siok_Thong_Sip_Ngoo_Im.db
Binary file not shown.
Binary file modified Tai_Loo_Han_Ji_Khoo.db
Binary file not shown.
18 changes: 16 additions & 2 deletions a702_查找及填入漢字標音.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,18 @@
from p702_Ca_Han_Ji_Thak_Im import ca_han_ji_thak_im
from p730_Tng_Sing_Bang_Iah import tng_sing_bang_iah


# ==========================================================
# 查詢語音類型,若未設定則預設為文讀音
# ==========================================================
def get_sound_type(wb):
    """Return the reading type configured in the workbook.

    Reads the cell referenced by the workbook-level named range
    ``語音類型``. Falls back to ``"文讀音"`` when the named range does not
    exist, or when the referenced cell is empty or contains only
    whitespace.

    :param wb: workbook object exposing ``names[...]`` whose entries
        provide ``refers_to_range.value``.
    :return: the configured reading type, or ``"文讀音"`` by default.
    """
    default_reading_type = "文讀音"

    try:
        reading_type = wb.names['語音類型'].refers_to_range.value
    except KeyError:
        # The named range is not defined in this workbook.
        return default_reading_type

    # Tolerate stray whitespace typed into the cell so the value still
    # matches the exact strings that callers compare against.
    if isinstance(reading_type, str):
        reading_type = reading_type.strip()

    # A defined-but-blank cell yields None (or ""); without this guard the
    # blank value would be passed on instead of the documented default.
    if reading_type is None or reading_type == "":
        return default_reading_type

    return reading_type


# 指定虛擬環境的 Python 路徑
venv_python = os.path.join(".venv", "Scripts", "python.exe") if sys.platform == "win32" else os.path.join(".venv", "bin", "python")

Expand Down Expand Up @@ -36,10 +48,12 @@
sheet.range('A1').select() # 將 A1 儲存格設為作用儲存格

# (2) A731: 自動為漢字查找讀音,並抄寫到漢字的上方(拼音)及下方(注音)。
ca_han_ji_thak_im(wb, '漢字注音', 'V3')
# type = '白話音'
type = get_sound_type(wb)
ca_han_ji_thak_im(wb, '漢字注音', 'V3', type)

# (3) A740: 將【漢字注音】工作表的內容,轉成 HTML 網頁檔案。
tng_sing_bang_iah(wb, '漢字注音', 'V3')
# tng_sing_bang_iah(wb, '漢字注音', 'V3')

# (4) A750: 將 Tai_Gi_Zu_Im_Bun.xlsx 檔案,依 env 工作表的設定,另存新檔到指定目錄。
try:
Expand Down
442 changes: 442 additions & 0 deletions docs/桃花源記_漢字注音.html

Large diffs are not rendered by default.

41 changes: 35 additions & 6 deletions mod_台羅音標漢字庫.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,29 @@ def close_db_connection(conn):
# 關閉數據庫連接
conn.close()


# ==========================================================
# 用 `漢字` 查詢《台語音標》的讀音資訊
# ==========================================================
def han_ji_ca_piau_im(cursor, han_ji):
def han_ji_ca_piau_im(cursor, han_ji, reading_type="文讀音"):
"""
根據漢字查詢其台羅音標及相關讀音資訊,並將台羅音標轉換為台語音標。
若資料紀錄在`常用度`欄位儲存值為空值(NULL),則將其視為 0,因此可排在查詢結果的最後。
:param cursor: 數據庫游標
:param han_ji: 欲查詢的漢字
:param reading_type: 查詢的讀音類型,可以是 "文讀音" 或 "白話音"
:return: 包含讀音資訊的字典列表,包含台語音標、聲母、韻母、聲調。
"""

query = """
if reading_type == "文讀音":
reading_condition = "常用度 >= 0.61"
elif reading_type == "白話音":
reading_condition = "常用度 <= 0.60"
else:
reading_condition = "1=1" # 查詢所有

query = f"""
SELECT
識別號,
漢字,
Expand All @@ -38,14 +47,34 @@ def han_ji_ca_piau_im(cursor, han_ji):
FROM
台羅音標漢字庫
WHERE
漢字 = ?
漢字 = ? AND ({reading_condition})
ORDER BY
COALESCE(常用度, 0) DESC;
"""

cursor.execute(query, (han_ji,))
results = cursor.fetchall()

# 如果沒有找到符合條件的讀音,則查詢所有讀音,並選擇常用度最高者
if not results:
query = """
SELECT
識別號,
漢字,
台羅音標,
常用度,
摘要說明
FROM
台羅音標漢字庫
WHERE
漢字 = ?
ORDER BY
COALESCE(常用度, 0) DESC
LIMIT 1;
"""
cursor.execute(query, (han_ji,))
results = cursor.fetchall()

# 定義【台羅音標】到【台語音標】的轉換規則
tai_luo_to_tai_gi_mapping = {
'tsh': 'c',
Expand Down Expand Up @@ -85,9 +114,9 @@ def han_ji_ca_piau_im(cursor, han_ji):
# 自「台語音標+」,分析出:聲母、韻母、聲調
# ==========================================================
def split_zu_im(zu_im):
# 先進行聲母轉換處理
zu_im = zu_im.replace("tsh", "c").replace("ch", "c") # 將 tsh, ch 轉換為 c
zu_im = zu_im.replace("ts", "z").replace("c", "z") # 將 ts, c 轉換為 z
# 聲母相容性轉換處理(將 tsh 轉換為 c;將 ts 轉換為 z)
zu_im = zu_im.replace("tsh", "c") # 將 tsh 轉換為 c
zu_im = zu_im.replace("ts", "z") # 將 ts 轉換為 z

# 定義聲母的正規表示式,包括常見的聲母,但不包括 m 和 ng
siann_bu_pattern = re.compile(r"(b|c|z|g|h|j|kh|k|l|m(?!\d)|ng(?!\d)|n|ph|p|s|th|t|Ø)")
Expand Down
Binary file modified output2/【河洛話注音】working.xlsx
Binary file not shown.
Binary file added output2/【河洛話注音】桃花源記.xlsx
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
6 changes: 3 additions & 3 deletions p702_Ca_Han_Ji_Thak_Im.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def is_valid_han_ji(char):
return char not in punctuation_marks


def ca_han_ji_thak_im(wb, sheet_name='漢字注音', cell='V3'):
def ca_han_ji_thak_im(wb, sheet_name='漢字注音', cell='V3', type="文讀音"):
# 顯示「已輸入之拼音字母及注音符號」
named_range = wb.names['顯示注音輸入'] # 選擇名為 "顯示注音輸入" 的命名範圍# 選擇名為 "顯示注音輸入" 的命名範圍
named_range.refers_to_range.value = True
Expand Down Expand Up @@ -122,8 +122,8 @@ def ca_han_ji_thak_im(wb, sheet_name='漢字注音', cell='V3'):
sheet.range((row - 1, col)).value = lo_ma_im_piau
sheet.range((row + 1, col)).value = zu_im_hu_ho
else:
# 查找漢字讀音
result = han_ji_ca_piau_im(cursor, han_ji)
# 查找漢字讀音 (type: 白話音、文讀音)
result = han_ji_ca_piau_im(cursor, han_ji, type)

# 取羅馬拼音和台語注音
if result:
Expand Down
11 changes: 0 additions & 11 deletions p711_TL_Tng_Zu_Im.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,18 +56,7 @@ def TL_Tng_Zu_Im(siann_bu, un_bu, siann_tiau, cursor):
# - xform/ㆡ(ㄧ|ㆪ)/ㆢ$1/
#=======================================================================

# 取【韻母】的第一個注音符號
# first_un_bu_char = zu_im_un_bu[0] if zu_im_un_bu else ''
#
# 比對聲母是否為 ㄗ、ㄘ、ㄙ、ㆡ,且韻母的第一個符號是 ㄧ 或 ㆪ
# if zu_im_siann_bu == 'ㄗ' and (first_un_bu_char == 'ㄧ' or first_un_bu_char == 'ㆪ'):
# zu_im_siann_bu = 'ㄐ'
# elif zu_im_siann_bu == 'ㄘ' and (first_un_bu_char == 'ㄧ' or first_un_bu_char == 'ㆪ'):
# zu_im_siann_bu = 'ㄑ'
# elif zu_im_siann_bu == 'ㄙ' and (first_un_bu_char == 'ㄧ' or first_un_bu_char == 'ㆪ'):
# zu_im_siann_bu = 'ㄒ'
# elif zu_im_siann_bu == 'ㆡ' and (first_un_bu_char == 'ㄧ' or first_un_bu_char == 'ㆪ'):
# zu_im_siann_bu = 'ㆢ'
if siann_bu == 'z' and (un_bu[0] == 'i' or un_bu == 'inn'):
zu_im_siann_bu = 'ㄐ'
elif siann_bu == 'c' and (un_bu[0] == 'i' or un_bu == 'inn'):
Expand Down

0 comments on commit a817e2f

Please sign in to comment.