Skip to content

Commit

Permalink
Revise documents for Ancient Chinese models
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Jan 13, 2025
1 parent f717be1 commit 0636dc6
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 21 deletions.
25 changes: 11 additions & 14 deletions docs/references.bib
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/
%% Created for hankcs at 2025-01-11 17:43:12 -0800
%% Created for hankcs at 2025-01-12 16:22:17 -0800
%% Saved with string encoding Unicode (UTF-8)
@inproceedings{yasuoka2019universal,
author = {Yasuoka, Koichi},
booktitle = {DADH2019: 10th International Conference of Digital Archives and Digital Humanities},
date-added = {2025-01-12 16:22:09 -0800},
date-modified = {2025-01-12 16:22:09 -0800},
organization = {Digital Archives and Digital Humanities},
pages = {20--28},
title = {Universal dependencies treebank of the four books in Classical Chinese},
url = {http://hdl.handle.net/2433/245217},
bdsk-url-1 = {http://hdl.handle.net/2433/245217},

@inproceedings{li-etal-2022-first,
abstract = {This paper presents the results of the First Ancient Chinese Word Segmentation and POS Tagging Bakeoff (EvaHan), which was held at the Second Workshop on Language Technologies for Historical and Ancient Languages (LT4HALA) 2022, in the context of the 13th Edition of the Language Resources and Evaluation Conference (LREC 2022). We give the motivation for having an international shared contest, as well as the data and tracks. The contest is consisted of two modalities, closed and open. In the closed modality, the participants are only allowed to use the training data, obtained the highest F1 score of 96.03{\%} and 92.05{\%} in word segmentation and POS tagging. In the open modality, the participants can use whatever resource they have, with the highest F1 score of 96.34{\%} and 92.56{\%} in word segmentation and POS tagging. The scores on the blind test dataset decrease around 3 points, which shows that the out-of-vocabulary words still are the bottleneck for lexical analyzers.},
address = {Marseille, France},
Expand All @@ -24,19 +34,6 @@ @inproceedings{li-etal-2022-first
year = {2022},
bdsk-url-1 = {https://aclanthology.org/2022.lt4hala-1.19/}}

@inproceedings{YASK:2019,
abstract = {Classical Chinese is an isolating language without notational inflection, and its texts are continuous strings of Chinese characters without spaces or punctuations between words or sentences. In order to apply Universal Dependencies for classical Chinese, we need several ``not-universal'' treatments and enhancements. In this paper such treatments and enhancements are revealed.},
author = {YASUOKA, Koichi},
date-added = {2025-01-11 17:39:18 -0800},
date-modified = {2025-01-11 17:39:18 -0800},
journal = {DADH2019: 10th International Conference of Digital Archives and Digital Humanities},
month = {12},
publisher = {Digital Archives and Digital Humanities},
title = {Universal Dependencies Treebank of the Four Books in Classical Chinese},
url = {http://hdl.handle.net/2433/245217},
year = {2019},
bdsk-url-1 = {http://hdl.handle.net/2433/245217}}

@inproceedings{wang2022uncertainty,
author = {Wang, Pengyu and Ren, Zhichen},
booktitle = {Proceedings of the Second Workshop on Language Technologies for Historical and Ancient Languages},
Expand Down
8 changes: 5 additions & 3 deletions hanlp/pretrained/mtl.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@
CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH = HANLP_URL + 'mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip'
"ERNIE (:cite:`xiao-etal-2021-ernie`) base version of joint tok, pos, ner, srl, dep, sdp and con model trained on close-source Chinese corpus."
KYOTO_EVAHAN_TOK_LEM_POS_UDEP_LZH = HANLP_URL + 'mtl/kyoto_evahan_tok_lem_pos_udep_bert-ancient-chinese_lr_1_aug_dict_20250112_154422.zip'
'Ancient Chinese tokenizer with bert-ancient-chinese (:cite:`wang2022uncertainty`) encoder trained on Classical Chinese ' \
'Universal Dependencies Treebank (:cite:`yasuoka2019universal`) and EvaHan corpus (:cite:`li-etal-2022-first`). ' \
'Performance: {tok/fine P: 98.91% R: 99.11% F1: 99.01%}{tok/coarse P: 94.71% R: 92.51% F1: 93.60%}{lem Accuracy:98.86%}{pos/upos Accuracy:94.91%}{pos/xpos Accuracy:93.79%}{pos/pku Accuracy:91.91%}{dep UAS: 88.70% LAS: 83.89%}'
'''
Ancient Chinese tokenizer with bert-ancient-chinese (:cite:`wang2022uncertainty`) encoder trained on Classical Chinese
Universal Dependencies Treebank (:cite:`yasuoka2019universal`) and EvaHan corpus (:cite:`li-etal-2022-first`).
Performance: ``{tok/fine P: 98.91% R: 99.11% F1: 99.01%}{tok/coarse P: 94.71% R: 92.51% F1: 93.60%}{lem Accuracy:98.86%}{pos/upos Accuracy:94.91%}{pos/xpos Accuracy:93.79%}{pos/pku Accuracy:91.91%}{dep UAS: 88.70% LAS: 83.89%}``
'''

UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L6 = HANLP_URL + 'mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mMiniLMv2L6_no_space_20220731_161526.zip'
'''
Expand Down
10 changes: 6 additions & 4 deletions hanlp/pretrained/tok.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,12 @@
'which is much higher than that of MTL model '

KYOTO_EVAHAN_TOK_LZH = 'http://download.hanlp.com/tok/extra/kyoto_evahan_tok_bert-ancient-chinese_tau_0.5_20250111_234146.zip'
'Ancient Chinese tokenizer with bert-ancient-chinese (:cite:`wang2022uncertainty`) encoder trained on Classical Chinese ' \
'Universal Dependencies Treebank (:cite:`YASK:2019`) and EvaHan corpus (:cite:`li-etal-2022-first`). ' \
'Performance: {UD P: 98.85% R: 99.00% F1: 98.92%} on UD Kyoto, ' \
'and {TestA P: 95.62% R: 96.56% F1: 96.09%} {TestB P: 94.93% R: 93.05% F1: 93.98%} on EvaHan.'
'''
Ancient Chinese tokenizer with bert-ancient-chinese (:cite:`wang2022uncertainty`) encoder trained on Classical Chinese
Universal Dependencies Treebank (:cite:`yasuoka2019universal`) and EvaHan corpus (:cite:`li-etal-2022-first`).
Performance: ``{UD P: 98.85% R: 99.00% F1: 98.92%}`` on UD Kyoto,
and ``{TestA P: 95.62% R: 96.56% F1: 96.09%} {TestB P: 94.93% R: 93.05% F1: 93.98%}`` on EvaHan.
'''

UD_TOK_MMINILMV2L6 = HANLP_URL + 'tok/ud_tok_mMiniLMv2L6_no_space_mul_20220619_091824.zip'
'''
Expand Down
Empty file.
8 changes: 8 additions & 0 deletions plugins/hanlp_demo/hanlp_demo/lzh/demo_mtl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Demo: Ancient Chinese (Literary Chinese, lzh) multi-task pipeline —
# joint tokenization, lemmatization, POS tagging and Universal Dependencies parsing.
import hanlp

# Load the pretrained joint model; the archive is downloaded on first use.
HanLP = hanlp.load(hanlp.pretrained.mtl.KYOTO_EVAHAN_TOK_LEM_POS_UDEP_LZH)
# Run a batch of two Classical Chinese sentences through the full pipeline.
doc = HanLP(['晋太元中,武陵人捕鱼为业。', '司馬牛問君子'])
print(doc)
# Render the analyses (tokens, tags, dependency tree) in a human-readable layout.
doc.pretty_print()

# Same pipeline on a single sentence, skipping the fine-grained tokenization task.
HanLP('司馬牛問君子', skip_tasks='tok/fine').pretty_print()
5 changes: 5 additions & 0 deletions plugins/hanlp_demo/hanlp_demo/lzh/demo_tok.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Demo: standalone Ancient Chinese (Literary Chinese) tokenizer.
import hanlp

# Load the pretrained tokenizer; the model archive is downloaded on first use.
HanLP = hanlp.load(hanlp.pretrained.tok.KYOTO_EVAHAN_TOK_LZH)
# Tokenize a Classical Chinese sentence into a list of words.
doc = HanLP('司馬牛問君子')
print(doc)

0 comments on commit 0636dc6

Please sign in to comment.