From 2a656b884045179efff8f41f4a877e1df7115644 Mon Sep 17 00:00:00 2001
From: Mu Yang
Date: Wed, 2 Jun 2021 10:08:33 +0800
Subject: [PATCH 1/3] Update version number.

---
 ckip_transformers/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ckip_transformers/__init__.py b/ckip_transformers/__init__.py
index 194d4ca..66327bc 100644
--- a/ckip_transformers/__init__.py
+++ b/ckip_transformers/__init__.py
@@ -10,7 +10,7 @@
 __copyright__ = '2020 CKIP Lab'
 
 __title__ = 'CKIP Transformers'
-__version__ = '0.2.4'
+__version__ = '0.2.5'
 __description__ = 'CKIP Transformers'
 __license__ = 'GPL-3.0'
 

From 9c17840fe57d01f8039a20e69a607d22d57d7a1c Mon Sep 17 00:00:00 2001
From: Mu Yang
Date: Wed, 2 Jun 2021 10:09:09 +0800
Subject: [PATCH 2/3] Update documentation.

---
 README.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index 06ca38d..c90b11d 100644
--- a/README.rst
+++ b/README.rst
@@ -146,8 +146,9 @@ Model Fine-Tuning
 | To fine-tune our model on your own datasets, please refer to the following examples from HuggingFace's transformers.
 | 您可參考以下的範例去微調我們的模型於您自己的資料集。
 
-- https://github.com/huggingface/transformers/tree/master/examples/language-modeling
-- https://github.com/huggingface/transformers/tree/master/examples/token-classification
+- https://github.com/huggingface/transformers/tree/master/examples
+- https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling
+- https://github.com/huggingface/transformers/tree/master/examples/pytorch/token-classification
 
 | Remember to set ``--tokenizer_name bert-base-chinese`` in order to use the Chinese tokenizer.
 | 記得設置 ``--tokenizer_name bert-base-chinese`` 以正確地使用中文的 tokenizer。

From c28ee9a7041feebcc10d38acddc90a723c11a1fa Mon Sep 17 00:00:00 2001
From: Mu Yang
Date: Wed, 2 Jun 2021 18:39:36 +0800
Subject: [PATCH 3/3] Format codes.

---
 ckip_transformers/nlp/__init__.py |  1 -
 ckip_transformers/nlp/driver.py   | 69 +++++++++++++++--------
 ckip_transformers/nlp/util.py     | 93 ++++++++++++++++++-----------
 3 files changed, 99 insertions(+), 64 deletions(-)

diff --git a/ckip_transformers/nlp/__init__.py b/ckip_transformers/nlp/__init__.py
index cdaffe2..f1d9551 100644
--- a/ckip_transformers/nlp/__init__.py
+++ b/ckip_transformers/nlp/__init__.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python3
 # -*- coding:utf-8 -*-
-
 """
 This module provides the CKIP Transformers NLP drivers.
 """

diff --git a/ckip_transformers/nlp/driver.py b/ckip_transformers/nlp/driver.py
index 959a15e..0ac6e58 100644
--- a/ckip_transformers/nlp/driver.py
+++ b/ckip_transformers/nlp/driver.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python3
 # -*- coding:utf-8 -*-
-
 """
 This module implements the CKIP Transformers NLP drivers.
 """
@@ -10,8 +9,7 @@
 __license__ = 'GPL-3.0'
 
 from typing import (
-    List,
-)
+    List, )
 
 import numpy as np
 
@@ -22,6 +20,7 @@
 
 ################################################################################################################################
 
+
 class CkipWordSegmenter(CkipTokenClassification):
     """The word segmentation driver.
 
@@ -42,14 +41,17 @@ class CkipWordSegmenter(CkipTokenClassification): 3: 'ckiplab/bert-base-chinese-ws', } - def __init__(self, + def __init__( + self, level: int = 3, **kwargs, ): - model_name = kwargs.pop('model_name', self._get_model_name_from_level(level)) + model_name = kwargs.pop('model_name', + self._get_model_name_from_level(level)) super().__init__(model_name=model_name, **kwargs) - def __call__(self, + def __call__( + self, input_text: List[str], *, use_delim: bool = False, @@ -112,8 +114,10 @@ def __call__(self, return output_text + ################################################################################################################################ + class CkipPosTagger(CkipTokenClassification): """The part-of-speech tagging driver. @@ -134,14 +138,17 @@ class CkipPosTagger(CkipTokenClassification): 3: 'ckiplab/bert-base-chinese-pos', } - def __init__(self, + def __init__( + self, level: int = 3, **kwargs, ): - model_name = kwargs.pop('model_name', self._get_model_name_from_level(level)) + model_name = kwargs.pop('model_name', + self._get_model_name_from_level(level)) super().__init__(model_name=model_name, **kwargs) - def __call__(self, + def __call__( + self, input_text: List[List[str]], *, use_delim: bool = True, @@ -194,8 +201,10 @@ def __call__(self, return output_text + ################################################################################################################################ + class CkipNerChunker(CkipTokenClassification): """The named-entity recognition driver. @@ -216,14 +225,17 @@ class CkipNerChunker(CkipTokenClassification): 3: 'ckiplab/bert-base-chinese-ner', } - def __init__(self, + def __init__( + self, level: int = 3, **kwargs, ): - model_name = kwargs.pop('model_name', self._get_model_name_from_level(level)) + model_name = kwargs.pop('model_name', + self._get_model_name_from_level(level)) super().__init__(model_name=model_name, **kwargs) - def __call__(self, + def __call__( + self, input_text: List[str], *, use_delim: bool = False, @@ -269,7 +281,10 @@ def __call__(self, entity_word = None entity_ner = None entity_idx0 = None - for index_char, (input_char, logits_index,) in enumerate(zip(*sent_data)): + for index_char, ( + input_char, + logits_index, + ) in enumerate(zip(*sent_data)): if logits_index is None: label = 'O' else: @@ -282,11 +297,15 @@ def __call__(self, bioes, ner = label.split('-') if bioes == 'S': - output_sent.append(NerToken( - word = input_char, - ner = ner, - idx = (index_char, index_char+len(input_char),), - )) + output_sent.append( + NerToken( + word=input_char, + ner=ner, + idx=( + index_char, + index_char + len(input_char), + ), + )) entity_ner = None elif bioes == 'B': entity_word = input_char @@ -300,11 +319,15 @@ def __call__(self, elif bioes == 'E': if entity_ner == ner: entity_word += input_char - output_sent.append(NerToken( - word = entity_word, - ner = entity_ner, - idx = (entity_idx0, index_char+len(input_char),), - )) + output_sent.append( + NerToken( + word=entity_word, + ner=entity_ner, + idx=( + entity_idx0, + index_char + len(input_char), + ), + )) entity_ner = None output_text.append(output_sent) diff --git a/ckip_transformers/nlp/util.py b/ckip_transformers/nlp/util.py index 279b32b..4641d08 100644 --- a/ckip_transformers/nlp/util.py +++ b/ckip_transformers/nlp/util.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 # -*- coding:utf-8 -*- - """ This module implements the utilities for CKIP Transformers NLP drivers. 
""" @@ -9,7 +8,6 @@ __copyright__ = '2020 CKIP Lab' __license__ = 'GPL-3.0' - from abc import ( ABCMeta, abstractmethod, @@ -41,6 +39,7 @@ ################################################################################################################################ + class CkipTokenClassification(metaclass=ABCMeta): """The base class for token classification task. @@ -54,17 +53,19 @@ class CkipTokenClassification(metaclass=ABCMeta): Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model on the associated CUDA device id. """ - - def __init__(self, + def __init__( + self, model_name: str, tokenizer_name: Optional[str] = None, *, device: int = -1, ): - self.model = AutoModelForTokenClassification.from_pretrained(model_name) - self.tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name or model_name) + self.model = AutoModelForTokenClassification.from_pretrained( + model_name) + self.tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name + or model_name) - self.device = torch.device('cpu' if device < 0 else f'cuda:{device}') # pylint: disable=no-member + self.device = torch.device('cpu' if device < 0 else f'cuda:{device}') # pylint: disable=no-member self.model.to(self.device) ######################################################################################################################## @@ -74,7 +75,8 @@ def __init__(self, def _model_names(cls): return NotImplemented # pragma: no cover - def _get_model_name_from_level(self, + def _get_model_name_from_level( + self, level: int, ): try: @@ -86,7 +88,8 @@ def _get_model_name_from_level(self, ######################################################################################################################## - def __call__(self, + def __call__( + self, input_text: Union[List[str], List[List[str]]], *, use_delim: bool = False, @@ -133,11 +136,10 @@ def __call__(self, if show_progress: input_text = tqdm(input_text, desc='Tokenization') - input_ids_worded = [ - [ - self.tokenizer.convert_tokens_to_ids(list(input_word)) for input_word in input_sent - ] for input_sent in input_text - ] + input_ids_worded = [[ + self.tokenizer.convert_tokens_to_ids(list(input_word)) + for input_word in input_sent + ] for input_sent in input_text] # Flatten input IDs ( @@ -153,9 +155,7 @@ def __call__(self, ( input_ids, attention_mask, - ) = self._pad_input_ids( - input_ids=input_ids, - ) + ) = self._pad_input_ids(input_ids=input_ids, ) # Convert input format encoded_input = BatchEncoding( @@ -183,10 +183,11 @@ def __call__(self, with torch.no_grad(): for batch in dataloader: batch = tuple(tensor.to(self.device) for tensor in batch) - ( - batch_logits, - ) = self.model(**dict(zip(encoded_input.keys(), batch)), return_dict=False) - batch_logits = batch_logits.cpu().numpy()[:, 1:, :] # Remove [CLS] + (batch_logits, ) = self.model(**dict( + zip(encoded_input.keys(), batch)), + return_dict=False) + batch_logits = batch_logits.cpu().numpy( + )[:, 1:, :] # Remove [CLS] logits.append(batch_logits) # Call model @@ -195,7 +196,8 @@ def __call__(self, return logits, index_map @staticmethod - def _find_delim(*, + def _find_delim( + *, input_text, use_delim, delim_set, @@ -208,11 +210,15 @@ def _find_delim(*, for sent_idx, input_sent in enumerate(input_text): for word_idx, input_word in enumerate(input_sent): if input_word in delim_set: - delim_index.add((sent_idx, word_idx,)) + delim_index.add(( + sent_idx, + word_idx, + )) return delim_index @staticmethod - def _flatten_input_ids(*, + def 
_flatten_input_ids( + *, input_ids_worded, max_length, delim_index, @@ -239,11 +245,14 @@ def _flatten_input_ids(*, # Insert tokens index_map_sent.append(( len(input_ids), # line index - len(input_ids_sent), # token index + len(input_ids_sent), # token index )) input_ids_sent += word_ids - if (sent_idx, word_idx,) in delim_index: + if ( + sent_idx, + word_idx, + ) in delim_index: input_ids.append(input_ids_sent) input_ids_sent = [] @@ -256,7 +265,9 @@ def _flatten_input_ids(*, return input_ids, index_map - def _pad_input_ids(self, *, + def _pad_input_ids( + self, + *, input_ids, ): max_length = max(map(len, input_ids)) @@ -266,22 +277,24 @@ def _pad_input_ids(self, *, for input_ids_sent in input_ids: token_count = len(input_ids_sent) pad_count = max_length - token_count - padded_input_ids.append( - [self.tokenizer.cls_token_id] + - input_ids_sent + - [self.tokenizer.sep_token_id] + - [self.tokenizer.pad_token_id] * pad_count - ) - attention_mask.append( - [1] * (token_count+2) + # [CLS] & input & [SEP] - [0] * pad_count # [PAD]s - ) + padded_input_ids.append([self.tokenizer.cls_token_id] + + input_ids_sent + + [self.tokenizer.sep_token_id] + + [self.tokenizer.pad_token_id] * pad_count) + attention_mask.append([1] * + (token_count + 2) + # [CLS] & input & [SEP] + [0] * pad_count # [PAD]s + ) return padded_input_ids, attention_mask + ################################################################################################################################ + class NerToken(NamedTuple): """A named-entity recognition token.""" - word: str #: ``str``, the token word. - ner: str #: ``str``, the NER-tag. - idx: Tuple[int, int] #: ``Tuple[int, int]``, the starting / ending index in the sentence. + word: str #: ``str``, the token word. + ner: str #: ``str``, the NER-tag. + idx: Tuple[ + int, + int] #: ``Tuple[int, int]``, the starting / ending index in the sentence.
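
A note on the ``--tokenizer_name bert-base-chinese`` advice in PATCH 2/3: within this package, ``CkipTokenClassification.__init__`` (util.py above) supports the same pairing through its ``tokenizer_name`` argument. The sketch below spells the pairing out with the level-3 word-segmentation checkpoint taken from driver.py's model map; it is an illustration, not part of the patches.

    from transformers import AutoModelForTokenClassification, BertTokenizerFast

    # Pair a CKIP checkpoint with the stock bert-base-chinese tokenizer,
    # mirroring the README's --tokenizer_name advice and the loading calls
    # made in CkipTokenClassification.__init__.
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
    model = AutoModelForTokenClassification.from_pretrained('ckiplab/bert-base-chinese-ws')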
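For reference, here is a minimal usage sketch of the three drivers reformatted in PATCH 3/3, assuming they are re-exported by ``ckip_transformers.nlp`` as that module's docstring indicates. The ``level`` and ``device`` parameters and the call signatures come from driver.py and util.py above; the sample sentence and printed format are illustrative only.

    from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

    # level=3 selects the bert-base checkpoints in each driver's model map;
    # device=-1 runs on CPU, a non-negative value selects that CUDA device.
    ws_driver = CkipWordSegmenter(level=3, device=-1)
    pos_driver = CkipPosTagger(level=3, device=-1)
    ner_driver = CkipNerChunker(level=3, device=-1)

    text = ['中央研究院的資訊所在南港。']

    ws_out = ws_driver(text)      # List[List[str]]: one word list per sentence
    pos_out = pos_driver(ws_out)  # POS tags, aligned word-for-word with ws_out
    ner_out = ner_driver(text)    # List[List[NerToken]]

    for word, tag in zip(ws_out[0], pos_out[0]):
        print(f'{word}({tag})', end=' ')
    print()
    for token in ner_out[0]:
        print(token.word, token.ner, token.idx)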
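PATCH 3/3 also reflows the BIOES decoding loop in ``CkipNerChunker.__call__``. For readers new to the scheme, here is a simplified, hypothetical restatement of that logic (it drops the logits handling and some edge cases of the original): S emits a single-character entity, B opens one, I extends it, and E closes it and records the character span.

    from typing import List, Tuple

    def decode_bioes(chars: str, labels: List[str]) -> List[Tuple[str, str, Tuple[int, int]]]:
        # Simplified mirror of the decoding loop in CkipNerChunker.__call__.
        entities = []
        word, ner, start = None, None, None
        for i, (ch, label) in enumerate(zip(chars, labels)):
            if label == 'O':
                ner = None
                continue
            bioes, tag = label.split('-')
            if bioes == 'S':
                entities.append((ch, tag, (i, i + 1)))
                ner = None
            elif bioes == 'B':
                word, ner, start = ch, tag, i
            elif bioes == 'I' and ner == tag:
                word += ch
            elif bioes == 'E' and ner == tag:
                word += ch
                entities.append((word, ner, (start, i + 1)))
                ner = None
        return entities

    # decode_bioes('台北市長', ['B-GPE', 'I-GPE', 'E-GPE', 'O'])
    # -> [('台北市', 'GPE', (0, 3))]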
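Finally, the reshaping of ``_pad_input_ids`` in util.py is behavior-preserving. As a sanity reference, this standalone sketch reproduces the convention the method implements: each sentence becomes [CLS] + tokens + [SEP], right-padded to the batch maximum, with an attention mask of ones over the CLS/input/SEP span and zeros over the padding. The literal IDs 101/102/0 are assumed here as bert-base-chinese's [CLS]/[SEP]/[PAD]; the real code reads them from the tokenizer.

    from typing import List, Tuple

    CLS_ID, SEP_ID, PAD_ID = 101, 102, 0  # assumed bert-base-chinese special-token IDs

    def pad_input_ids(input_ids: List[List[int]]) -> Tuple[List[List[int]], List[List[int]]]:
        # Standalone mirror of CkipTokenClassification._pad_input_ids.
        max_length = max(map(len, input_ids))
        padded, mask = [], []
        for sent in input_ids:
            pad_count = max_length - len(sent)
            padded.append([CLS_ID] + sent + [SEP_ID] + [PAD_ID] * pad_count)
            mask.append([1] * (len(sent) + 2) + [0] * pad_count)
        return padded, mask

    # pad_input_ids([[1, 2, 3], [4]]) ->
    #   ([[101, 1, 2, 3, 102], [101, 4, 102, 0, 0]],
    #    [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0]])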