Skip to content

Commit

Permalink
Revise documents for Ancient Chinese models
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Jan 13, 2025
1 parent f717be1 commit 0636dc6
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 21 deletions.
25 changes: 11 additions & 14 deletions docs/references.bib
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/
%% Created for hankcs at 2025-01-11 17:43:12 -0800
%% Created for hankcs at 2025-01-12 16:22:17 -0800
%% Saved with string encoding Unicode (UTF-8)
@inproceedings{yasuoka2019universal,
author = {Yasuoka, Koichi},
booktitle = {DADH2019: 10th International Conference of Digital Archives and Digital Humanities},
date-added = {2025-01-12 16:22:09 -0800},
date-modified = {2025-01-12 16:22:09 -0800},
organization = {Digital Archives and Digital Humanities},
pages = {20--28},
title = {Universal dependencies treebank of the four books in Classical Chinese},
url = {http://hdl.handle.net/2433/245217},
bdsk-url-1 = {http://hdl.handle.net/2433/245217},

@inproceedings{li-etal-2022-first,
abstract = {This paper presents the results of the First Ancient Chinese Word Segmentation and POS Tagging Bakeoff (EvaHan), which was held at the Second Workshop on Language Technologies for Historical and Ancient Languages (LT4HALA) 2022, in the context of the 13th Edition of the Language Resources and Evaluation Conference (LREC 2022). We give the motivation for having an international shared contest, as well as the data and tracks. The contest is consisted of two modalities, closed and open. In the closed modality, the participants are only allowed to use the training data, obtained the highest F1 score of 96.03{\%} and 92.05{\%} in word segmentation and POS tagging. In the open modality, the participants can use whatever resource they have, with the highest F1 score of 96.34{\%} and 92.56{\%} in word segmentation and POS tagging. The scores on the blind test dataset decrease around 3 points, which shows that the out-of-vocabulary words still are the bottleneck for lexical analyzers.},
address = {Marseille, France},
Expand All @@ -24,19 +34,6 @@ @inproceedings{li-etal-2022-first
year = {2022},
bdsk-url-1 = {https://aclanthology.org/2022.lt4hala-1.19/}}

@inproceedings{YASK:2019,
abstract = {Classical Chinese is an isolating language without notational inflection, and its texts are continuous strings of Chinese characters without spaces or punctuations between words or sentences. In order to apply Universal Dependencies for classical Chinese, we need several ``not-universal'' treatments and enhancements. In this paper such treatments and enhancements are revealed.},
author = {YASUOKA, Koichi},
date-added = {2025-01-11 17:39:18 -0800},
date-modified = {2025-01-11 17:39:18 -0800},
journal = {DADH2019: 10th International Conference of Digital Archives and Digital Humanities},
month = {12},
publisher = {Digital Archives and Digital Humanities},
title = {Universal Dependencies Treebank of the Four Books in Classical Chinese},
url = {http://hdl.handle.net/2433/245217},
year = {2019},
bdsk-url-1 = {http://hdl.handle.net/2433/245217}}

@inproceedings{wang2022uncertainty,
author = {Wang, Pengyu and Ren, Zhichen},
booktitle = {Proceedings of the Second Workshop on Language Technologies for Historical and Ancient Languages},
Expand Down
8 changes: 5 additions & 3 deletions hanlp/pretrained/mtl.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@
CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH = HANLP_URL + 'mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip'
"ERNIE (:cite:`xiao-etal-2021-ernie`) base version of joint tok, pos, ner, srl, dep, sdp and con model trained on close-source Chinese corpus."
KYOTO_EVAHAN_TOK_LEM_POS_UDEP_LZH = HANLP_URL + 'mtl/kyoto_evahan_tok_lem_pos_udep_bert-ancient-chinese_lr_1_aug_dict_20250112_154422.zip'
'Ancient Chinese tokenizer with bert-ancient-chinese (:cite:`wang2022uncertainty`) encoder trained on Classical Chinese ' \
'Universal Dependencies Treebank (:cite:`yasuoka2019universal`) and EvaHan corpus (:cite:`li-etal-2022-first`). ' \
'Performance: {tok/fine P: 98.91% R: 99.11% F1: 99.01%}{tok/coarse P: 94.71% R: 92.51% F1: 93.60%}{lem Accuracy:98.86%}{pos/upos Accuracy:94.91%}{pos/xpos Accuracy:93.79%}{pos/pku Accuracy:91.91%}{dep UAS: 88.70% LAS: 83.89%}'
'''
Ancient Chinese tokenizer with bert-ancient-chinese (:cite:`wang2022uncertainty`) encoder trained on Classical Chinese
Universal Dependencies Treebank (:cite:`yasuoka2019universal`) and EvaHan corpus (:cite:`li-etal-2022-first`).
Performance: ``{tok/fine P: 98.91% R: 99.11% F1: 99.01%}{tok/coarse P: 94.71% R: 92.51% F1: 93.60%}{lem Accuracy:98.86%}{pos/upos Accuracy:94.91%}{pos/xpos Accuracy:93.79%}{pos/pku Accuracy:91.91%}{dep UAS: 88.70% LAS: 83.89%}``
'''

UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L6 = HANLP_URL + 'mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mMiniLMv2L6_no_space_20220731_161526.zip'
'''
Expand Down
10 changes: 6 additions & 4 deletions hanlp/pretrained/tok.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,12 @@
'which is much higher than that of MTL model '

KYOTO_EVAHAN_TOK_LZH = 'http://download.hanlp.com/tok/extra/kyoto_evahan_tok_bert-ancient-chinese_tau_0.5_20250111_234146.zip'
'Ancient Chinese tokenizer with bert-ancient-chinese (:cite:`wang2022uncertainty`) encoder trained on Classical Chinese ' \
'Universal Dependencies Treebank (:cite:`YASK:2019`) and EvaHan corpus (:cite:`li-etal-2022-first`). ' \
'Performance: {UD P: 98.85% R: 99.00% F1: 98.92%} on UD Kyoto, ' \
'and {TestA P: 95.62% R: 96.56% F1: 96.09%} {TestB P: 94.93% R: 93.05% F1: 93.98%} on EvaHan.'
'''
Ancient Chinese tokenizer with bert-ancient-chinese (:cite:`wang2022uncertainty`) encoder trained on Classical Chinese
Universal Dependencies Treebank (:cite:`yasuoka2019universal`) and EvaHan corpus (:cite:`li-etal-2022-first`).
Performance: ``{UD P: 98.85% R: 99.00% F1: 98.92%}`` on UD Kyoto,
and ``{TestA P: 95.62% R: 96.56% F1: 96.09%} {TestB P: 94.93% R: 93.05% F1: 93.98%}`` on EvaHan.
'''

UD_TOK_MMINILMV2L6 = HANLP_URL + 'tok/ud_tok_mMiniLMv2L6_no_space_mul_20220619_091824.zip'
'''
Expand Down
Empty file.
8 changes: 8 additions & 0 deletions plugins/hanlp_demo/hanlp_demo/lzh/demo_mtl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Demo: Ancient Chinese (Literary Chinese, lzh) multi-task pipeline —
# joint tokenization, lemmatization, POS tagging and Universal Dependencies parsing.
import hanlp

# Load the pretrained joint model; the archive is downloaded on first use.
HanLP = hanlp.load(hanlp.pretrained.mtl.KYOTO_EVAHAN_TOK_LEM_POS_UDEP_LZH)
# Run a batch of two Classical Chinese sentences through the full pipeline.
doc = HanLP(['晋太元中,武陵人捕鱼为业。', '司馬牛問君子'])
print(doc)
# Render the analyses (tokens, tags, dependency tree) in a human-readable layout.
doc.pretty_print()

# Same pipeline on a single sentence, skipping the fine-grained tokenization task.
HanLP('司馬牛問君子', skip_tasks='tok/fine').pretty_print()
5 changes: 5 additions & 0 deletions plugins/hanlp_demo/hanlp_demo/lzh/demo_tok.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Demo: standalone Ancient Chinese (Literary Chinese) tokenizer.
import hanlp

# Load the pretrained tokenizer; the model archive is downloaded on first use.
HanLP = hanlp.load(hanlp.pretrained.tok.KYOTO_EVAHAN_TOK_LZH)
# Tokenize a Classical Chinese sentence into a list of words.
doc = HanLP('司馬牛問君子')
print(doc)

0 comments on commit 0636dc6

Please sign in to comment.