Merge branch 'release/0.2.1'
emfomy committed Jan 5, 2021
2 parents 3bc6a2e + 7052648 commit 03af506
Showing 10 changed files with 107 additions and 26 deletions.
4 changes: 1 addition & 3 deletions DEVELOP.md
@@ -1,5 +1,6 @@
# Release TODO
- change version number
- make sure requirements.txt and test/requirements.txt match setup.py.

- >> make clean
- >> make lint
@@ -10,6 +11,3 @@
- merge to master branch
- >> make clean
- >> make upload
# Requirements
Make sure test/requirements.txt matches setup.py.
88 changes: 78 additions & 10 deletions README.rst
@@ -138,7 +138,7 @@ Model Usage
Model Fine-Tunning
^^^^^^^^^^^^^^^^^^

| To fine tunning our model on your own datasets, please refer the the following example from HuggingFace's transformers.
| To fine-tune our model on your own datasets, please refer to the following example from HuggingFace's transformers.
| 您可參考以下的範例去微調我們的模型於您自己的資料集。
- https://github.com/huggingface/transformers/tree/master/examples/language-modeling
@@ -159,8 +159,8 @@ Model Fine-Tunning
--tokenizer_name bert-base-chinese \
...
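As a rough companion to the ``run_language_modeling`` command shown above, here is a minimal Trainer-based sketch of the same masked-language-model fine-tuning. It is not the project's official recipe: the ``ckiplab/bert-base-chinese`` checkpoint name and the ``train.txt`` corpus path are assumptions to replace with your own.

.. code-block:: python

    from transformers import (
        AutoModelForMaskedLM,
        AutoTokenizer,
        DataCollatorForLanguageModeling,
        LineByLineTextDataset,
        Trainer,
        TrainingArguments,
    )

    # Reuse the bert-base-chinese tokenizer, matching --tokenizer_name above.
    tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

    # Assumed starting checkpoint; substitute the model you want to fine-tune.
    model = AutoModelForMaskedLM.from_pretrained('ckiplab/bert-base-chinese')

    # One paragraph of raw text per line; the path is an assumption.
    dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path='train.txt', block_size=128)
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

    args = TrainingArguments(
        output_dir='finetuned-ckip-bert',
        num_train_epochs=1,
        per_device_train_batch_size=8,
    )

    Trainer(model=model, args=args, train_dataset=dataset, data_collator=collator).train()

The HuggingFace example script linked above remains the reference for full options such as evaluation, checkpointing, and distributed training.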
Performance
^^^^^^^^^^^
Model Performance
^^^^^^^^^^^^^^^^^

| The following is a performance comparison between our model and other models.
| The results are tested on a traditional Chinese corpus.
@@ -187,6 +187,71 @@ bert-base-chinese 2.53 -- -- --
| ‡ WS: word segmentation; POS: part-of-speech; NER: named-entity recognition; the larger the better.
| ‡ WS: 斷詞;POS: 詞性標記;NER: 實體辨識;數字越大越好。
Training Corpus
^^^^^^^^^^^^^^^

| The language models are trained on the ZhWiki and CNA datasets; the WS and POS tasks are trained on the ASBC dataset; the NER tasks are trained on the OntoNotes dataset.
| 以上的語言模型訓練於 ZhWiki 與 CNA 資料集上;斷詞(WS)與詞性標記(POS)任務模型訓練於 ASBC 資料集上;實體辨識(NER)任務模型訓練於 OntoNotes 資料集上。
* ZhWiki: https://dumps.wikimedia.org/zhwiki/
| Chinese Wikipedia text (20200801 dump), translated to Traditional using `OpenCC <https://github.com/BYVoid/OpenCC>`_.
| 中文維基的文章(20200801 版本),利用 `OpenCC <https://github.com/BYVoid/OpenCC>`_ 翻譯成繁體中文。
* CNA: https://catalog.ldc.upenn.edu/LDC2011T13
| Chinese Gigaword Fifth Edition — CNA (Central News Agency part).
| 中文 Gigaword 第五版 — CNA(中央社)的部分.
* ASBC: http://asbc.iis.sinica.edu.tw
| Academia Sinica Balanced Corpus of Modern Chinese version 4.
| 中央研究院漢語平衡語料庫第四版。
* OntoNotes: https://catalog.ldc.upenn.edu/LDC2013T19
| OntoNotes release 5.0, Chinese part, translated to Traditional using `OpenCC <https://github.com/BYVoid/OpenCC>`_.
| OntoNotes 第五版,中文部分,利用 `OpenCC <https://github.com/BYVoid/OpenCC>`_ 翻譯成繁體中文。
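The ZhWiki and OntoNotes entries above were converted to Traditional Chinese with OpenCC. As a rough illustration only (assuming the ``opencc`` Python package, where ``OpenCC('s2t')`` selects the Simplified-to-Traditional config; some distributions expect ``'s2t.json'`` instead, and the exact config used for the corpus is not stated here):

.. code-block:: python

    from opencc import OpenCC

    # Simplified -> Traditional conversion, as applied to ZhWiki and OntoNotes.
    converter = OpenCC('s2t')
    print(converter.convert('汉语自然语言处理'))  # -> 漢語自然語言處理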
| Here is a summary of each corpus.
| 以下是各個資料集的一覽表。
================ ================ ================ ================ ================
Dataset          #Documents       #Lines           #Characters      Line Type
================ ================ ================ ================ ================
CNA              2,559,520        13,532,445       1,219,029,974    Paragraph
ZhWiki           1,106,783        5,918,975        495,446,829      Paragraph
ASBC             19,247           1,395,949        17,572,374       Clause
OntoNotes        1,911            48,067           1,568,491        Sentence
================ ================ ================ ================ ================

| Here is the dataset split used for language models.
| 以下是用於訓練語言模型的資料集切割。
================ ================ ================ ================
CNA+ZhWiki       #Documents       #Lines           #Characters
================ ================ ================ ================
Train            3,606,303        18,986,238       4,347,517,682
Dev              30,000           148,077          32,888,978
Test             30,000           151,241          35,216,818
================ ================ ================ ================

| Here is the dataset split used for word segmentation and part-of-speech tagging models.
| 以下是用於訓練斷詞及詞性標記模型的資料集切割。
================ ================ ================ ================ ================
ASBC             #Documents       #Lines           #Words           #Characters
================ ================ ================ ================ ================
Train            15,247           1,183,260        9,480,899        14,724,250
Dev              2,000            52,677           448,964          741,323
Test             2,000            160,012          1,315,129        2,106,799
================ ================ ================ ================ ================


| Here is the dataset split used for named entity recognition models.
| 以下是用於訓練實體辨識模型的資料集切割。
================ ================ ================ ================ ================
OntoNotes        #Documents       #Lines           #Characters      #Named-Entities
================ ================ ================ ================ ================
Train            1,511            43,362           1,367,658        68,947
Dev              200              2,304            93,535           7,186
Test             200              2,401            107,298          6,977
================ ================ ================ ================ ================

NLP Tools
---------

@@ -272,7 +337,7 @@ NLP Tools Usage
| The POS driver will automatically segment the sentence internally using these characters ``',,。::;;!!??'`` while running the model. (The output sentences will be concatenated back.) You may set ``delim_set`` to any characters you want.
| You may set ``use_delim=False`` to disable this feature, or set ``use_delim=True`` in the WS and NER drivers to enable this feature.
| 詞性標記工具會自動用 ``',,。::;;!!??'`` 等字元在執行模型前切割句子(輸出的句子會自動接回)。可設定 ``delim_set`` 參數已使用別的字元做切割
| 詞性標記工具會自動用 ``',,。::;;!!??'`` 等字元在執行模型前切割句子(輸出的句子會自動接回)。可設定 ``delim_set`` 參數使用別的字元做切割
| 另外可指定 ``use_delim=False`` 以停用此功能,或於斷詞、實體辨識時指定 ``use_delim=True`` 以啟用此功能。
.. code-block:: python
@@ -339,8 +404,8 @@ NLP Tools Usage
空白 也是可以的~
空白(VH)  (WHITESPACE) 也(D) 是(SHI) 可以(VH) 的(T) ~(FW)
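The delimiter behaviour described above can be exercised roughly as follows. This is a sketch rather than the package's canonical example: it assumes the ``level=3`` constructor argument and that ``use_delim``/``delim_set`` are accepted as call-time keyword arguments; the exact signatures may differ between versions.

.. code-block:: python

    from ckip_transformers.nlp import CkipPosTagger, CkipWordSegmenter

    ws_driver = CkipWordSegmenter(level=3)
    pos_driver = CkipPosTagger(level=3)

    text = ['中文字耶,啊哈哈哈。']

    # WS (and NER) drivers do not split on delimiters unless asked to.
    ws = ws_driver(text, use_delim=True)

    # The POS driver splits on ',,。::;;!!??' by default; delim_set
    # overrides the delimiter characters, and use_delim=False disables
    # the splitting altogether.
    pos = pos_driver(ws, delim_set=',,。')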
Performance
^^^^^^^^^^^
NLP Tools Performance
^^^^^^^^^^^^^^^^^^^^^

| The following is a performance comparison between our tool and other tools.
| 以下是我們的工具與其他的工具之性能比較。
@@ -367,16 +432,19 @@ Level Tool WS (F1) POS (Acc) WS+POS (F1)
CKIP Transformers v.s. CkipTagger
""""""""""""""""""""""""""""""""""""

| The following results are tested on a different dataset.
| 以下實驗在另一個資料集測試。
===== ======================== =========== ============= =============== ============
Level Tool WS (F1) POS (Acc) WS+POS (F1) NER (F1)
===== ======================== =========== ============= =============== ============
3 CKIP BERT Base **97.84%** 96.46% **94.91%** 79.20%
-- CkipTagger 97.33% **97.20%** 94.75% **77.87%**
3 CKIP BERT Base **97.84%** 96.46% **94.91%** **79.20%**
-- CkipTagger 97.33% **97.20%** 94.75% 77.87%
===== ======================== =========== ============= =============== ============

| † Here we retrained/tested our BERT model using the same dataset as CkipTagger.
| † 我們重新訓練/測試我們的 BERT 模型於跟 CkipTagger 相同的資料集。
License
-------

2 changes: 1 addition & 1 deletion ckip_transformers/__init__.py
@@ -10,7 +10,7 @@
__copyright__ = '2020 CKIP Lab'

__title__ = 'CKIP Transformers'
__version__ = '0.2.0'
__version__ = '0.2.1'
__description__ = 'CKIP Transformers'
__license__ = 'GPL-3.0'

2 changes: 1 addition & 1 deletion ckip_transformers/nlp/util.py
@@ -185,7 +185,7 @@ def __call__(self,
batch = tuple(tensor.to(self.device) for tensor in batch)
(
batch_logits,
) = self.model(**dict(zip(encoded_input.keys(), batch)))
) = self.model(**dict(zip(encoded_input.keys(), batch)), return_dict=False)
batch_logits = batch_logits.cpu().numpy()[:, 1:, :] # Remove [CLS]
logits.append(batch_logits)
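For context on this change (offered as a sketch, not project documentation): transformers 4.x makes ``return_dict=True`` the default, so a bare forward pass returns a ``ModelOutput`` object rather than a tuple, and the ``(batch_logits,) = ...`` unpacking above would fail. Passing ``return_dict=False`` keeps the tuple form on both 3.5.x and 4.x. The checkpoint below is only an illustration, not necessarily the one the driver loads.

.. code-block:: python

    import torch
    from transformers import AutoModelForTokenClassification, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
    model = AutoModelForTokenClassification.from_pretrained('bert-base-chinese')
    inputs = tokenizer('中文字耶', return_tensors='pt')

    with torch.no_grad():
        # Under transformers 4.x this is a TokenClassifierOutput dataclass
        # (under 3.x it is still a tuple unless return_dict=True is passed).
        output = model(**inputs)

        # return_dict=False forces the plain tuple on either major version,
        # which is what the unpacking in util.py relies on.
        (logits,) = model(**inputs, return_dict=False)
        print(logits.shape)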

11 changes: 11 additions & 0 deletions docs/_static/custom.css
@@ -1,3 +1,14 @@
/* header */

h1 {
font-size: 250%;

display: inline-block;
border-bottom: 3px solid #2980b9;
}

/* content */

.rst-content p.rubric {
font-size: 125%;
}
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,3 +1,3 @@
torch>=1.1.0
tqdm>=4.27
transformers>=3.5.0,<4
transformers>=3.5.0
2 changes: 1 addition & 1 deletion setup.py
@@ -45,7 +45,7 @@ def main():
install_requires=[
'torch>=1.1.0',
'tqdm>=4.27',
'transformers>=3.5.0,<4',
'transformers>=3.5.0',
],
data_files=[],
)
8 changes: 4 additions & 4 deletions test/Makefile
@@ -4,16 +4,16 @@ TOX = tox
.PHONY: tox tox-v tox-report clean

tox:
NO_COV= $(TOX) -p -e py{36,37,38} --
NO_COV= $(TOX) -p -e py36-hf{3,4} --

tox-v:
NO_COV= $(TOX) -e py{36,37,38} -- -v
NO_COV= $(TOX) -e py36-hf{3,4} -- -v

tox-vv:
NO_COV= $(TOX) -e py36 -- -vv
NO_COV= $(TOX) -e py36-hf4 -- -vv

tox-report:
- $(TOX) -p -e clean,py36,report -- --cov-append
- $(TOX) -p -e clean,py36-hf4,report -- --cov-append
python3.7 -m http.server --directory .test/htmlcov/ 3000

clean:
2 changes: 2 additions & 0 deletions test/requirements.txt
@@ -0,0 +1,2 @@
torch>=1.1.0
tqdm>=4.27
12 changes: 7 additions & 5 deletions test/tox.ini
@@ -1,5 +1,5 @@
[tox]
envlist = clean,py{36,37,38},report
envlist = clean,py{36,37,38}-hf{3,4},report
skipsdist = true

[testenv]
@@ -11,11 +11,13 @@ deps =
pytest
pytest-cov
pytest-xdist
-r ../requirements.txt
-r ./requirements.txt
hf3: transformers>=3.5,<4
hf4: transformers>=4

depends =
py{36,37,38}: clean
report: py{36,37,38}
py{36,37,38}-hf{3,4}: clean
report: py{36,37,38}-hf{3,4}

[testenv:report]
deps = coverage
@@ -30,6 +32,6 @@ skip_install = true
commands =
coverage erase

[testenv:py{36,37,38}]
[testenv:py36-hf{3,4}]
commands =
pytest {toxinidir}/script/nlp/run.py {env:NO_COV:--cov=ckip_transformers.nlp} {posargs}
