Skip to content

Commit

Permalink
Merge branch 'release/0.2.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
emfomy committed Dec 23, 2020
2 parents efdd15d + 6c0e931 commit 3bc6a2e
Show file tree
Hide file tree
Showing 16 changed files with 1,175 additions and 879 deletions.
674 changes: 0 additions & 674 deletions COPYING

This file was deleted.

682 changes: 671 additions & 11 deletions LICENSE

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ TWINE = twine
TOX = tox
LINT = pylint --rcfile=./.pylintrc

.PHONY: all check dist sdist test tox tox-v tox-report lint doc upload clean
.PHONY: all check dist sdist test tox tox-v tox-vv tox-report lint doc upload clean

all: dist check test

Expand All @@ -21,7 +21,7 @@ lint:
check:
$(TWINE) check dist/*

tox tox-v tox-report:
tox tox-v tox-vv tox-report:
( cd test && make $@ )

doc:
Expand Down
328 changes: 252 additions & 76 deletions README.rst

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion ckip_transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
__copyright__ = '2020 CKIP Lab'

__title__ = 'CKIP Transformers'
__version__ = '0.1.0'
__version__ = '0.2.0'
__description__ = 'CKIP Transformers'
__license__ = 'GPL-3.0'

Expand Down
144 changes: 97 additions & 47 deletions ckip_transformers/nlp/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

from typing import (
List,
Optional,
)

import numpy as np
Expand All @@ -28,32 +27,49 @@ class CkipWordSegmenter(CkipTokenClassification):
Parameters
----------
model_name : ``str``, *optional*, defaults to ``'ckiplab/bert-base-chinese-ws'``
The pretrained model name.
tokenizer_name : ``str``, *optional*, defaults to **model_name**
The pretrained tokenizer name.
level : ``int``, *optional*, defaults to 3, must be 1–3
The model level. The higher the level is, the more accurate and slower the model is.
device : ``int``, *optional*, defaults to -1
Device ordinal for CPU/GPU supports.
Setting this to -1 will leverage CPU; a positive value will run the model on the associated CUDA device id.
"""

_model_names = {
1: 'ckiplab/albert-tiny-chinese-ws',
2: 'ckiplab/albert-base-chinese-ws',
3: 'ckiplab/bert-base-chinese-ws',
}

def __init__(self,
model_name: Optional[str] = 'ckiplab/bert-base-chinese-ws',
tokenizer_name: Optional[str] = None,
level: int = 3,
**kwargs,
):
super().__init__(model_name=model_name, tokenizer_name=tokenizer_name)
model_name = kwargs.pop('model_name', self._get_model_name_from_level(level))
super().__init__(model_name=model_name, **kwargs)

def __call__(self,
input_text: List[str],
*,
max_length: Optional[int] = None,
use_delim: bool = False,
**kwargs,
) -> List[List[str]]:
"""Call the driver.
Parameters
----------
input_text : ``List[str]``
The input sentences. Each sentence is a string.
use_delim : ``bool``, *optional*, defaults to False
Segment sentence (internally) using ``delim_set``.
delim_set : `str`, *optional*, defaults to ``',,。::;;!!??'``
Used for sentence segmentation if ``use_delim=True``.
batch_size : ``int``, *optional*, defaults to 256
The size of mini-batch.
max_length : ``int``, *optional*
The maximum length of the sentence,
must not be longer than the maximum sequence length for this model (i.e. ``tokenizer.model_max_length``).
show_progress : ``bool``, *optional*, defaults to True
Show progress bar.
Returns
-------
Expand All @@ -63,25 +79,25 @@ def __call__(self,

# Call model
(
loss,
logits,
index_map,
) = super().__call__(input_text, max_length=max_length)
) = super().__call__(input_text, use_delim=use_delim, **kwargs)

# Post-process results
output_text = []
for sent_data in zip(input_text, index_map):
output_sent = []
word = ''
for input_char, loss_index in zip(*sent_data):
if loss_index is None:
for input_char, logits_index in zip(*sent_data):
if logits_index is None:
if word:
output_sent.append(word)
output_sent.append(input_char)
word = ''
else:
loss_b, loss_i = loss[loss_index]
logits_b, logits_i = logits[logits_index]

if loss_b > loss_i:
if logits_b > logits_i:
if word:
output_sent.append(word)
word = input_char
Expand All @@ -101,32 +117,49 @@ class CkipPosTagger(CkipTokenClassification):
Parameters
----------
model_name : ``str``, *optional*, defaults to ``'ckiplab/bert-base-chinese-pos'``
The pretrained model name.
tokenizer_name : ``str``, *optional*, defaults to **model_name**
The pretrained tokenizer name.
level : ``int``, *optional*, defaults to 3, must be 1–3
The model level. The higher the level is, the more accurate and slower the model is.
device : ``int``, *optional*, defaults to -1
Device ordinal for CPU/GPU supports.
Setting this to -1 will leverage CPU; a positive value will run the model on the associated CUDA device id.
"""

_model_names = {
1: 'ckiplab/albert-tiny-chinese-pos',
2: 'ckiplab/albert-base-chinese-pos',
3: 'ckiplab/bert-base-chinese-pos',
}

def __init__(self,
model_name: Optional[str] = 'ckiplab/bert-base-chinese-pos',
tokenizer_name: Optional[str] = None,
level: int = 3,
**kwargs,
):
super().__init__(model_name=model_name, tokenizer_name=tokenizer_name)
model_name = kwargs.pop('model_name', self._get_model_name_from_level(level))
super().__init__(model_name=model_name, **kwargs)

def __call__(self,
input_text: List[List[str]],
*,
max_length: Optional[int] = None,
use_delim: bool = True,
**kwargs,
) -> List[List[str]]:
"""Call the driver.
Parameters
----------
input_text : ``List[List[str]]``
The input sentences. Each sentence is a list of strings (words).
use_delim : ``bool``, *optional*, defaults to True
Segment sentence (internally) using ``delim_set``.
delim_set : `str`, *optional*, defaults to ``',,。::;;!!??'``
Used for sentence segmentation if ``use_delim=True``.
batch_size : ``int``, *optional*, defaults to 256
The size of mini-batch.
max_length : ``int``, *optional*
The maximum length of the sentence,
must not be longer than the maximum sequence length for this model (i.e. ``tokenizer.model_max_length``).
show_progress : ``bool``, *optional*, defaults to True
Show progress bar.
Returns
-------
Expand All @@ -136,9 +169,9 @@ def __call__(self,

# Call model
(
loss,
logits,
index_map,
) = super().__call__(input_text, max_length=max_length)
) = super().__call__(input_text, use_delim=use_delim, **kwargs)

# Get labels
id2label = self.model.config.id2label
Expand All @@ -147,11 +180,11 @@ def __call__(self,
output_text = []
for sent_data in zip(input_text, index_map):
output_sent = []
for _, loss_index in zip(*sent_data):
if loss_index is None:
for input_char, logits_index in zip(*sent_data):
if logits_index is None or input_char.isspace():
label = 'WHITESPACE'
else:
label = id2label[np.argmax(loss[loss_index])]
label = id2label[np.argmax(logits[logits_index])]
output_sent.append(label)
output_text.append(output_sent)

Expand All @@ -164,32 +197,49 @@ class CkipNerChunker(CkipTokenClassification):
Parameters
----------
model_name : ``str``, *optional*, defaults to ``'ckiplab/bert-base-chinese-ner'``
The pretrained model name.
tokenizer_name : ``str``, *optional*, defaults to **model_name**
The pretrained tokenizer name.
level : ``int``, *optional*, defaults to 3, must be 1–3
The model level. The higher the level is, the more accurate and slower the model is.
device : ``int``, *optional*, defaults to -1
Device ordinal for CPU/GPU supports.
Setting this to -1 will leverage CPU; a positive value will run the model on the associated CUDA device id.
"""

_model_names = {
1: 'ckiplab/albert-tiny-chinese-ner',
2: 'ckiplab/albert-base-chinese-ner',
3: 'ckiplab/bert-base-chinese-ner',
}

def __init__(self,
model_name: Optional[str] = 'ckiplab/bert-base-chinese-ner',
tokenizer_name: Optional[str] = None,
level: int = 3,
**kwargs,
):
super().__init__(model_name=model_name, tokenizer_name=tokenizer_name)
model_name = kwargs.pop('model_name', self._get_model_name_from_level(level))
super().__init__(model_name=model_name, **kwargs)

def __call__(self,
input_text: List[str],
*,
max_length: Optional[int] = None,
use_delim: bool = False,
**kwargs,
) -> List[List[NerToken]]:
"""Call the driver.
Parameters
----------
input_text : ``List[str]``
The input sentences. Each sentence is a string.
The input sentences. Each sentence is a string or a list of strings (words).
use_delim : ``bool``, *optional*, defaults to False
Segment sentence (internally) using ``delim_set``.
delim_set : `str`, *optional*, defaults to ``',,。::;;!!??'``
Used for sentence segmentation if ``use_delim=True``.
batch_size : ``int``, *optional*, defaults to 256
The size of mini-batch.
max_length : ``int``, *optional*
The maximum length of the sentence,
must not be longer than the maximum sequence length for this model (i.e. ``tokenizer.model_max_length``).
show_progress : ``bool``, *optional*, defaults to True
Show progress bar.
Returns
-------
Expand All @@ -199,9 +249,9 @@ def __call__(self,

# Call model
(
loss,
logits,
index_map,
) = super().__call__(input_text, max_length=max_length)
) = super().__call__(input_text, use_delim=use_delim, **kwargs)

# Get labels
id2label = self.model.config.id2label
Expand All @@ -213,11 +263,11 @@ def __call__(self,
entity_word = None
entity_ner = None
entity_idx0 = None
for index_char, (input_char, loss_index,) in enumerate(zip(*sent_data)):
if loss_index is None:
for index_char, (input_char, logits_index,) in enumerate(zip(*sent_data)):
if logits_index is None:
label = 'O'
else:
label = id2label[np.argmax(loss[loss_index])]
label = id2label[np.argmax(logits[logits_index])]

if label == 'O':
entity_ner = None
Expand All @@ -244,11 +294,11 @@ def __call__(self,
elif bioes == 'E':
if entity_ner == ner:
entity_word += input_char
output_sent.append(NerToken(
word = entity_word,
ner = entity_ner,
idx = (entity_idx0, index_char+len(input_char),),
))
output_sent.append(NerToken(
word = entity_word,
ner = entity_ner,
idx = (entity_idx0, index_char+len(input_char),),
))
entity_ner = None

output_text.append(output_sent)
Expand Down
Loading

0 comments on commit 3bc6a2e

Please sign in to comment.