Skip to content

Commit

Permalink
Merge branch 'release/0.2.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
emfomy committed Dec 23, 2020
2 parents efdd15d + 6c0e931 commit 3bc6a2e
Show file tree
Hide file tree
Showing 16 changed files with 1,175 additions and 879 deletions.
674 changes: 0 additions & 674 deletions COPYING

This file was deleted.

682 changes: 671 additions & 11 deletions LICENSE

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ TWINE = twine
TOX = tox
LINT = pylint --rcfile=./.pylintrc

.PHONY: all check dist sdist test tox tox-v tox-report lint doc upload clean
.PHONY: all check dist sdist test tox tox-v tox-vv tox-report lint doc upload clean

all: dist check test

Expand All @@ -21,7 +21,7 @@ lint:
check:
$(TWINE) check dist/*

tox tox-v tox-report:
tox tox-v tox-vv tox-report:
( cd test && make $@ )

doc:
Expand Down
328 changes: 252 additions & 76 deletions README.rst

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion ckip_transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
__copyright__ = '2020 CKIP Lab'

__title__ = 'CKIP Transformers'
__version__ = '0.1.0'
__version__ = '0.2.0'
__description__ = 'CKIP Transformers'
__license__ = 'GPL-3.0'

Expand Down
144 changes: 97 additions & 47 deletions ckip_transformers/nlp/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

from typing import (
List,
Optional,
)

import numpy as np
Expand All @@ -28,32 +27,49 @@ class CkipWordSegmenter(CkipTokenClassification):
Parameters
----------
model_name : ``str``, *optional*, defaults to ``'ckiplab/bert-base-chinese-ws'``
The pretrained model name.
tokenizer_name : ``str``, *optional*, defaults to **model_name**
The pretrained tokenizer name.
level : ``int``, *optional*, defaults to 3, must be 1–3
The model level. The higher the level is, the more accurate and slower the model is.
device : ``int``, *optional*, defaults to -1
Device ordinal for CPU/GPU supports.
Setting this to -1 will leverage CPU; a positive value will run the model on the associated CUDA device id.
"""

_model_names = {
1: 'ckiplab/albert-tiny-chinese-ws',
2: 'ckiplab/albert-base-chinese-ws',
3: 'ckiplab/bert-base-chinese-ws',
}

def __init__(self,
model_name: Optional[str] = 'ckiplab/bert-base-chinese-ws',
tokenizer_name: Optional[str] = None,
level: int = 3,
**kwargs,
):
super().__init__(model_name=model_name, tokenizer_name=tokenizer_name)
model_name = kwargs.pop('model_name', self._get_model_name_from_level(level))
super().__init__(model_name=model_name, **kwargs)

def __call__(self,
input_text: List[str],
*,
max_length: Optional[int] = None,
use_delim: bool = False,
**kwargs,
) -> List[List[str]]:
"""Call the driver.
Parameters
----------
input_text : ``List[str]``
The input sentences. Each sentence is a string.
use_delim : ``bool``, *optional*, defaults to False
Segment sentence (internally) using ``delim_set``.
delim_set : `str`, *optional*, defaults to ``',,。::;;!!??'``
Used for sentence segmentation if ``use_delim=True``.
batch_size : ``int``, *optional*, defaults to 256
The size of mini-batch.
max_length : ``int``, *optional*
The maximum length of the sentence,
must not be longer than the maximum sequence length for this model (i.e. ``tokenizer.model_max_length``).
show_progress : ``bool``, *optional*, defaults to True
Show progress bar.
Returns
-------
Expand All @@ -63,25 +79,25 @@ def __call__(self,

# Call model
(
loss,
logits,
index_map,
) = super().__call__(input_text, max_length=max_length)
) = super().__call__(input_text, use_delim=use_delim, **kwargs)

# Post-process results
output_text = []
for sent_data in zip(input_text, index_map):
output_sent = []
word = ''
for input_char, loss_index in zip(*sent_data):
if loss_index is None:
for input_char, logits_index in zip(*sent_data):
if logits_index is None:
if word:
output_sent.append(word)
output_sent.append(input_char)
word = ''
else:
loss_b, loss_i = loss[loss_index]
logits_b, logits_i = logits[logits_index]

if loss_b > loss_i:
if logits_b > logits_i:
if word:
output_sent.append(word)
word = input_char
Expand All @@ -101,32 +117,49 @@ class CkipPosTagger(CkipTokenClassification):
Parameters
----------
model_name : ``str``, *optional*, defaults to ``'ckiplab/bert-base-chinese-pos'``
The pretrained model name.
tokenizer_name : ``str``, *optional*, defaults to **model_name**
The pretrained tokenizer name.
level : ``int``, *optional*, defaults to 3, must be 1–3
The model level. The higher the level is, the more accurate and slower the model is.
device : ``int``, *optional*, defaults to -1
Device ordinal for CPU/GPU supports.
Setting this to -1 will leverage CPU; a positive value will run the model on the associated CUDA device id.
"""

_model_names = {
1: 'ckiplab/albert-tiny-chinese-pos',
2: 'ckiplab/albert-base-chinese-pos',
3: 'ckiplab/bert-base-chinese-pos',
}

def __init__(self,
model_name: Optional[str] = 'ckiplab/bert-base-chinese-pos',
tokenizer_name: Optional[str] = None,
level: int = 3,
**kwargs,
):
super().__init__(model_name=model_name, tokenizer_name=tokenizer_name)
model_name = kwargs.pop('model_name', self._get_model_name_from_level(level))
super().__init__(model_name=model_name, **kwargs)

def __call__(self,
input_text: List[List[str]],
*,
max_length: Optional[int] = None,
use_delim: bool = True,
**kwargs,
) -> List[List[str]]:
"""Call the driver.
Parameters
----------
input_text : ``List[List[str]]``
The input sentences. Each sentence is a list of strings (words).
use_delim : ``bool``, *optional*, defaults to True
Segment sentence (internally) using ``delim_set``.
delim_set : `str`, *optional*, defaults to ``',,。::;;!!??'``
Used for sentence segmentation if ``use_delim=True``.
batch_size : ``int``, *optional*, defaults to 256
The size of mini-batch.
max_length : ``int``, *optional*
The maximum length of the sentence,
must not be longer than the maximum sequence length for this model (i.e. ``tokenizer.model_max_length``).
show_progress : ``bool``, *optional*, defaults to True
Show progress bar.
Returns
-------
Expand All @@ -136,9 +169,9 @@ def __call__(self,

# Call model
(
loss,
logits,
index_map,
) = super().__call__(input_text, max_length=max_length)
) = super().__call__(input_text, use_delim=use_delim, **kwargs)

# Get labels
id2label = self.model.config.id2label
Expand All @@ -147,11 +180,11 @@ def __call__(self,
output_text = []
for sent_data in zip(input_text, index_map):
output_sent = []
for _, loss_index in zip(*sent_data):
if loss_index is None:
for input_char, logits_index in zip(*sent_data):
if logits_index is None or input_char.isspace():
label = 'WHITESPACE'
else:
label = id2label[np.argmax(loss[loss_index])]
label = id2label[np.argmax(logits[logits_index])]
output_sent.append(label)
output_text.append(output_sent)

Expand All @@ -164,32 +197,49 @@ class CkipNerChunker(CkipTokenClassification):
Parameters
----------
model_name : ``str``, *optional*, defaults to ``'ckiplab/bert-base-chinese-ner'``
The pretrained model name.
tokenizer_name : ``str``, *optional*, defaults to **model_name**
The pretrained tokenizer name.
level : ``int``, *optional*, defaults to 3, must be 1–3
The model level. The higher the level is, the more accurate and slower the model is.
device : ``int``, *optional*, defaults to -1
Device ordinal for CPU/GPU supports.
Setting this to -1 will leverage CPU; a positive value will run the model on the associated CUDA device id.
"""

_model_names = {
1: 'ckiplab/albert-tiny-chinese-ner',
2: 'ckiplab/albert-base-chinese-ner',
3: 'ckiplab/bert-base-chinese-ner',
}

def __init__(self,
model_name: Optional[str] = 'ckiplab/bert-base-chinese-ner',
tokenizer_name: Optional[str] = None,
level: int = 3,
**kwargs,
):
super().__init__(model_name=model_name, tokenizer_name=tokenizer_name)
model_name = kwargs.pop('model_name', self._get_model_name_from_level(level))
super().__init__(model_name=model_name, **kwargs)

def __call__(self,
input_text: List[str],
*,
max_length: Optional[int] = None,
use_delim: bool = False,
**kwargs,
) -> List[List[NerToken]]:
"""Call the driver.
Parameters
----------
input_text : ``List[str]``
The input sentences. Each sentence is a string.
The input sentences. Each sentence is a string or a list of strings (words).
use_delim : ``bool``, *optional*, defaults to False
Segment sentence (internally) using ``delim_set``.
delim_set : `str`, *optional*, defaults to ``',,。::;;!!??'``
Used for sentence segmentation if ``use_delim=True``.
batch_size : ``int``, *optional*, defaults to 256
The size of mini-batch.
max_length : ``int``, *optional*
The maximum length of the sentence,
must not be longer than the maximum sequence length for this model (i.e. ``tokenizer.model_max_length``).
show_progress : ``bool``, *optional*, defaults to True
Show progress bar.
Returns
-------
Expand All @@ -199,9 +249,9 @@ def __call__(self,

# Call model
(
loss,
logits,
index_map,
) = super().__call__(input_text, max_length=max_length)
) = super().__call__(input_text, use_delim=use_delim, **kwargs)

# Get labels
id2label = self.model.config.id2label
Expand All @@ -213,11 +263,11 @@ def __call__(self,
entity_word = None
entity_ner = None
entity_idx0 = None
for index_char, (input_char, loss_index,) in enumerate(zip(*sent_data)):
if loss_index is None:
for index_char, (input_char, logits_index,) in enumerate(zip(*sent_data)):
if logits_index is None:
label = 'O'
else:
label = id2label[np.argmax(loss[loss_index])]
label = id2label[np.argmax(logits[logits_index])]

if label == 'O':
entity_ner = None
Expand All @@ -244,11 +294,11 @@ def __call__(self,
elif bioes == 'E':
if entity_ner == ner:
entity_word += input_char
output_sent.append(NerToken(
word = entity_word,
ner = entity_ner,
idx = (entity_idx0, index_char+len(input_char),),
))
output_sent.append(NerToken(
word = entity_word,
ner = entity_ner,
idx = (entity_idx0, index_char+len(input_char),),
))
entity_ner = None

output_text.append(output_sent)
Expand Down
Loading

0 comments on commit 3bc6a2e

Please sign in to comment.