From 0636dc612df01765359cdcb3e4f7e53fadf34530 Mon Sep 17 00:00:00 2001 From: hankcs <jfservice@126.com> Date: Sun, 12 Jan 2025 16:28:42 -0800 Subject: [PATCH] Revise documents for Ancient Chinese models --- docs/references.bib | 25 ++++++++----------- hanlp/pretrained/mtl.py | 8 +++--- hanlp/pretrained/tok.py | 10 +++++--- plugins/hanlp_demo/hanlp_demo/lzh/__init__.py | 0 plugins/hanlp_demo/hanlp_demo/lzh/demo_mtl.py | 8 ++++++ plugins/hanlp_demo/hanlp_demo/lzh/demo_tok.py | 5 ++++ 6 files changed, 35 insertions(+), 21 deletions(-) create mode 100644 plugins/hanlp_demo/hanlp_demo/lzh/__init__.py create mode 100644 plugins/hanlp_demo/hanlp_demo/lzh/demo_mtl.py create mode 100644 plugins/hanlp_demo/hanlp_demo/lzh/demo_tok.py diff --git a/docs/references.bib b/docs/references.bib index 5784d4b4c..7521c0486 100644 --- a/docs/references.bib +++ b/docs/references.bib @@ -1,13 +1,23 @@ %% This BibTeX bibliography file was created using BibDesk. %% https://bibdesk.sourceforge.io/ -%% Created for hankcs at 2025-01-11 17:43:12 -0800 +%% Created for hankcs at 2025-01-12 16:22:17 -0800 %% Saved with string encoding Unicode (UTF-8) +@inproceedings{yasuoka2019universal, + author = {Yasuoka, Koichi}, + booktitle = {DADH2019: 10th International Conference of Digital Archives and Digital Humanities}, + date-added = {2025-01-12 16:22:09 -0800}, + date-modified = {2025-01-12 16:22:09 -0800}, + organization = {Digital Archives and Digital Humanities}, + pages = {20--28}, + title = {Universal dependencies treebank of the four books in Classical Chinese}, + year = {2019}} + @inproceedings{li-etal-2022-first, abstract = {This paper presents the results of the First Ancient Chinese Word Segmentation and POS Tagging Bakeoff (EvaHan), which was held at the Second Workshop on Language Technologies for Historical and Ancient Languages (LT4HALA) 2022, in the context of the 13th Edition of the Language Resources and Evaluation Conference (LREC 2022). We give the motivation for having an international shared contest, as well as the data and tracks. The contest is consisted of two modalities, closed and open. In the closed modality, the participants are only allowed to use the training data, obtained the highest F1 score of 96.03{\%} and 92.05{\%} in word segmentation and POS tagging. In the open modality, the participants can use whatever resource they have, with the highest F1 score of 96.34{\%} and 92.56{\%} in word segmentation and POS tagging. The scores on the blind test dataset decrease around 3 points, which shows that the out-of-vocabulary words still are the bottleneck for lexical analyzers.}, address = {Marseille, France}, @@ -24,19 +34,6 @@ @inproceedings{li-etal-2022-first year = {2022}, bdsk-url-1 = {https://aclanthology.org/2022.lt4hala-1.19/}} -@inproceedings{YASK:2019, - abstract = {Classical Chinese is an isolating language without notational inflection, and its texts are continuous strings of Chinese characters without spaces or punctuations between words or sentences. In order to apply Universal Dependencies for classical Chinese, we need several ``not-universal'' treatments and enhancements. 
In this paper such treatments and enhancements are revealed.},
-	author = {YASUOKA, Koichi},
-	date-added = {2025-01-11 17:39:18 -0800},
-	date-modified = {2025-01-11 17:39:18 -0800},
-	journal = {DADH2019: 10th International Conference of Digital Archives and Digital Humanities},
-	month = {12},
-	publisher = {Digital Archives and Digital Humanities},
-	title = {Universal Dependencies Treebank of the Four Books in Classical Chinese},
-	url = {http://hdl.handle.net/2433/245217},
-	year = {2019},
-	bdsk-url-1 = {http://hdl.handle.net/2433/245217}}
-
 @inproceedings{wang2022uncertainty,
 	author = {Wang, Pengyu and Ren, Zhichen},
 	booktitle = {Proceedings of the Second Workshop on Language Technologies for Historical and Ancient Languages},
diff --git a/hanlp/pretrained/mtl.py b/hanlp/pretrained/mtl.py
index fda740c68..cfe9eea0e 100644
--- a/hanlp/pretrained/mtl.py
+++ b/hanlp/pretrained/mtl.py
@@ -19,9 +19,11 @@
 CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH = HANLP_URL + 'mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip'
 "ERNIE (:cite:`xiao-etal-2021-ernie`) base version of joint tok, pos, ner, srl, dep, sdp and con model trained on close-source Chinese corpus."
 KYOTO_EVAHAN_TOK_LEM_POS_UDEP_LZH = HANLP_URL + 'mtl/kyoto_evahan_tok_lem_pos_udep_bert-ancient-chinese_lr_1_aug_dict_20250112_154422.zip'
-'Ancient Chinese tokenizer with bert-ancient-chinese (:cite:`wang2022uncertainty`) encoder trained on Classical Chinese ' \
-'Universal Dependencies Treebank (:cite:`yasuoka2019universal`) and EvaHan corpus (:cite:`li-etal-2022-first`). ' \
-'Performance: {tok/fine P: 98.91% R: 99.11% F1: 99.01%}{tok/coarse P: 94.71% R: 92.51% F1: 93.60%}{lem Accuracy:98.86%}{pos/upos Accuracy:94.91%}{pos/xpos Accuracy:93.79%}{pos/pku Accuracy:91.91%}{dep UAS: 88.70% LAS: 83.89%}'
+'''
+Ancient Chinese joint tok, lem, pos and dep model with a bert-ancient-chinese (:cite:`wang2022uncertainty`) encoder, trained on the Classical Chinese
+Universal Dependencies Treebank (:cite:`yasuoka2019universal`) and the EvaHan corpus (:cite:`li-etal-2022-first`).
+Performance: ``{tok/fine P: 98.91% R: 99.11% F1: 99.01%}{tok/coarse P: 94.71% R: 92.51% F1: 93.60%}{lem Accuracy:98.86%}{pos/upos Accuracy:94.91%}{pos/xpos Accuracy:93.79%}{pos/pku Accuracy:91.91%}{dep UAS: 88.70% LAS: 83.89%}``
+'''
 UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L6 = HANLP_URL + 'mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mMiniLMv2L6_no_space_20220731_161526.zip'
 '''
diff --git a/hanlp/pretrained/tok.py b/hanlp/pretrained/tok.py
index 83e0236d5..7629848e6 100644
--- a/hanlp/pretrained/tok.py
+++ b/hanlp/pretrained/tok.py
@@ -35,10 +35,12 @@
 'which is much higher than that of MTL model '
 KYOTO_EVAHAN_TOK_LZH = 'http://download.hanlp.com/tok/extra/kyoto_evahan_tok_bert-ancient-chinese_tau_0.5_20250111_234146.zip'
-'Ancient Chinese tokenizer with bert-ancient-chinese (:cite:`wang2022uncertainty`) encoder trained on Classical Chinese ' \
-'Universal Dependencies Treebank (:cite:`YASK:2019`) and EvaHan corpus (:cite:`li-etal-2022-first`). ' \
-'Performance: {UD P: 98.85% R: 99.00% F1: 98.92%} on UD Kyoto, ' \
-'and {TestA P: 95.62% R: 96.56% F1: 96.09%} {TestB P: 94.93% R: 93.05% F1: 93.98%} on EvaHan.'
+'''
+Ancient Chinese tokenizer with a bert-ancient-chinese (:cite:`wang2022uncertainty`) encoder, trained on the Classical Chinese
+Universal Dependencies Treebank (:cite:`yasuoka2019universal`) and the EvaHan corpus (:cite:`li-etal-2022-first`).
+Performance: ``{UD P: 98.85% R: 99.00% F1: 98.92%}`` on UD Kyoto, +and ``{TestA P: 95.62% R: 96.56% F1: 96.09%} {TestB P: 94.93% R: 93.05% F1: 93.98%}`` on EvaHan. +''' UD_TOK_MMINILMV2L6 = HANLP_URL + 'tok/ud_tok_mMiniLMv2L6_no_space_mul_20220619_091824.zip' ''' diff --git a/plugins/hanlp_demo/hanlp_demo/lzh/__init__.py b/plugins/hanlp_demo/hanlp_demo/lzh/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/plugins/hanlp_demo/hanlp_demo/lzh/demo_mtl.py b/plugins/hanlp_demo/hanlp_demo/lzh/demo_mtl.py new file mode 100644 index 000000000..36624c3ff --- /dev/null +++ b/plugins/hanlp_demo/hanlp_demo/lzh/demo_mtl.py @@ -0,0 +1,8 @@ +import hanlp + +HanLP = hanlp.load(hanlp.pretrained.mtl.KYOTO_EVAHAN_TOK_LEM_POS_UDEP_LZH) +doc = HanLP(['晋太元中,武陵人捕鱼为业。', '司馬牛問君子']) +print(doc) +doc.pretty_print() + +HanLP('司馬牛問君子', skip_tasks='tok/fine').pretty_print() diff --git a/plugins/hanlp_demo/hanlp_demo/lzh/demo_tok.py b/plugins/hanlp_demo/hanlp_demo/lzh/demo_tok.py new file mode 100644 index 000000000..c2a1aa0f9 --- /dev/null +++ b/plugins/hanlp_demo/hanlp_demo/lzh/demo_tok.py @@ -0,0 +1,5 @@ +import hanlp + +HanLP = hanlp.load(hanlp.pretrained.tok.KYOTO_EVAHAN_TOK_LZH) +doc = HanLP('司馬牛問君子') +print(doc)
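
For quick reference, a minimal usage sketch for the two models documented above, going one step beyond the bundled demos. It assumes the standard HanLP multi-task interface: the returned Document is dict-like and keyed by task name, and a `tasks` argument restricts prediction to a subset of tasks; the task keys used below are taken from the performance string in mtl.py and are otherwise an assumption.

import hanlp

# Joint tok/lem/pos/dep model for Ancient Chinese (added in this patch).
HanLP = hanlp.load(hanlp.pretrained.mtl.KYOTO_EVAHAN_TOK_LEM_POS_UDEP_LZH)

# Restrict prediction to fine-grained tokenization and universal POS tags.
doc = HanLP('司馬牛問君子', tasks=['tok/fine', 'pos/upos'])
print(doc['tok/fine'])   # fine-grained tokens
print(doc['pos/upos'])   # universal POS tags aligned with tok/fine

# The single-task tokenizer returns a plain list of tokens for a str input.
tok = hanlp.load(hanlp.pretrained.tok.KYOTO_EVAHAN_TOK_LZH)
print(tok('晋太元中,武陵人捕鱼为业。'))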