Skip to content

Commit

Permalink
Merge branch 'hotfix/0.2.5'
Browse files Browse the repository at this point in the history
  • Loading branch information
emfomy committed Jun 2, 2021
2 parents d059bd5 + c28ee9a commit 3e1d007
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 67 deletions.
5 changes: 3 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,9 @@ Model Fine-Tuning
| To fine-tune our model on your own datasets, please refer to the following examples from HuggingFace's transformers.
| 您可參考以下的範例去微調我們的模型於您自己的資料集。
- https://github.com/huggingface/transformers/tree/master/examples/language-modeling
- https://github.com/huggingface/transformers/tree/master/examples/token-classification
- https://github.com/huggingface/transformers/tree/master/examples
- https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling
- https://github.com/huggingface/transformers/tree/master/examples/pytorch/token-classification

| Remember to set ``--tokenizer_name bert-base-chinese`` in order to use the Chinese tokenizer.
| 記得設置 ``--tokenizer_name bert-base-chinese`` 以正確的使用中文的 tokenizer。
Expand Down
2 changes: 1 addition & 1 deletion ckip_transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
__copyright__ = '2020 CKIP Lab'

__title__ = 'CKIP Transformers'
__version__ = '0.2.4'
__version__ = '0.2.5'
__description__ = 'CKIP Transformers'
__license__ = 'GPL-3.0'

Expand Down
1 change: 0 additions & 1 deletion ckip_transformers/nlp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
This module provides the CKIP Transformers NLP drivers.
"""
Expand Down
69 changes: 46 additions & 23 deletions ckip_transformers/nlp/driver.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
This module implements the CKIP Transformers NLP drivers.
"""
Expand All @@ -10,8 +9,7 @@
__license__ = 'GPL-3.0'

from typing import (
List,
)
List, )

import numpy as np

Expand All @@ -22,6 +20,7 @@

################################################################################################################################


class CkipWordSegmenter(CkipTokenClassification):
"""The word segmentation driver.
Expand All @@ -42,14 +41,17 @@ class CkipWordSegmenter(CkipTokenClassification):
3: 'ckiplab/bert-base-chinese-ws',
}

def __init__(self,
def __init__(
self,
level: int = 3,
**kwargs,
):
model_name = kwargs.pop('model_name', self._get_model_name_from_level(level))
model_name = kwargs.pop('model_name',
self._get_model_name_from_level(level))
super().__init__(model_name=model_name, **kwargs)

def __call__(self,
def __call__(
self,
input_text: List[str],
*,
use_delim: bool = False,
Expand Down Expand Up @@ -112,8 +114,10 @@ def __call__(self,

return output_text


################################################################################################################################


class CkipPosTagger(CkipTokenClassification):
"""The part-of-speech tagging driver.
Expand All @@ -134,14 +138,17 @@ class CkipPosTagger(CkipTokenClassification):
3: 'ckiplab/bert-base-chinese-pos',
}

def __init__(self,
def __init__(
self,
level: int = 3,
**kwargs,
):
model_name = kwargs.pop('model_name', self._get_model_name_from_level(level))
model_name = kwargs.pop('model_name',
self._get_model_name_from_level(level))
super().__init__(model_name=model_name, **kwargs)

def __call__(self,
def __call__(
self,
input_text: List[List[str]],
*,
use_delim: bool = True,
Expand Down Expand Up @@ -194,8 +201,10 @@ def __call__(self,

return output_text


################################################################################################################################


class CkipNerChunker(CkipTokenClassification):
"""The named-entity recognition driver.
Expand All @@ -216,14 +225,17 @@ class CkipNerChunker(CkipTokenClassification):
3: 'ckiplab/bert-base-chinese-ner',
}

def __init__(self,
def __init__(
self,
level: int = 3,
**kwargs,
):
model_name = kwargs.pop('model_name', self._get_model_name_from_level(level))
model_name = kwargs.pop('model_name',
self._get_model_name_from_level(level))
super().__init__(model_name=model_name, **kwargs)

def __call__(self,
def __call__(
self,
input_text: List[str],
*,
use_delim: bool = False,
Expand Down Expand Up @@ -269,7 +281,10 @@ def __call__(self,
entity_word = None
entity_ner = None
entity_idx0 = None
for index_char, (input_char, logits_index,) in enumerate(zip(*sent_data)):
for index_char, (
input_char,
logits_index,
) in enumerate(zip(*sent_data)):
if logits_index is None:
label = 'O'
else:
Expand All @@ -282,11 +297,15 @@ def __call__(self,
bioes, ner = label.split('-')

if bioes == 'S':
output_sent.append(NerToken(
word = input_char,
ner = ner,
idx = (index_char, index_char+len(input_char),),
))
output_sent.append(
NerToken(
word=input_char,
ner=ner,
idx=(
index_char,
index_char + len(input_char),
),
))
entity_ner = None
elif bioes == 'B':
entity_word = input_char
Expand All @@ -300,11 +319,15 @@ def __call__(self,
elif bioes == 'E':
if entity_ner == ner:
entity_word += input_char
output_sent.append(NerToken(
word = entity_word,
ner = entity_ner,
idx = (entity_idx0, index_char+len(input_char),),
))
output_sent.append(
NerToken(
word=entity_word,
ner=entity_ner,
idx=(
entity_idx0,
index_char + len(input_char),
),
))
entity_ner = None

output_text.append(output_sent)
Expand Down
93 changes: 53 additions & 40 deletions ckip_transformers/nlp/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
This module implements the utilities for CKIP Transformers NLP drivers.
"""
Expand All @@ -9,7 +8,6 @@
__copyright__ = '2020 CKIP Lab'
__license__ = 'GPL-3.0'


from abc import (
ABCMeta,
abstractmethod,
Expand Down Expand Up @@ -41,6 +39,7 @@

################################################################################################################################


class CkipTokenClassification(metaclass=ABCMeta):
"""The base class for token classification task.
Expand All @@ -54,17 +53,19 @@ class CkipTokenClassification(metaclass=ABCMeta):
Device ordinal for CPU/GPU support.
Setting this to -1 (or any negative value) will use the CPU; a non-negative value will run the model on the CUDA device with that id.
"""

def __init__(self,
def __init__(
self,
model_name: str,
tokenizer_name: Optional[str] = None,
*,
device: int = -1,
):
self.model = AutoModelForTokenClassification.from_pretrained(model_name)
self.tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name or model_name)
self.model = AutoModelForTokenClassification.from_pretrained(
model_name)
self.tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name
or model_name)

self.device = torch.device('cpu' if device < 0 else f'cuda:{device}') # pylint: disable=no-member
self.device = torch.device('cpu' if device < 0 else f'cuda:{device}') # pylint: disable=no-member
self.model.to(self.device)

########################################################################################################################
Expand All @@ -74,7 +75,8 @@ def __init__(self,
def _model_names(cls):
return NotImplemented # pragma: no cover

def _get_model_name_from_level(self,
def _get_model_name_from_level(
self,
level: int,
):
try:
Expand All @@ -86,7 +88,8 @@ def _get_model_name_from_level(self,

########################################################################################################################

def __call__(self,
def __call__(
self,
input_text: Union[List[str], List[List[str]]],
*,
use_delim: bool = False,
Expand Down Expand Up @@ -133,11 +136,10 @@ def __call__(self,
if show_progress:
input_text = tqdm(input_text, desc='Tokenization')

input_ids_worded = [
[
self.tokenizer.convert_tokens_to_ids(list(input_word)) for input_word in input_sent
] for input_sent in input_text
]
input_ids_worded = [[
self.tokenizer.convert_tokens_to_ids(list(input_word))
for input_word in input_sent
] for input_sent in input_text]

# Flatten input IDs
(
Expand All @@ -153,9 +155,7 @@ def __call__(self,
(
input_ids,
attention_mask,
) = self._pad_input_ids(
input_ids=input_ids,
)
) = self._pad_input_ids(input_ids=input_ids, )

# Convert input format
encoded_input = BatchEncoding(
Expand Down Expand Up @@ -183,10 +183,11 @@ def __call__(self,
with torch.no_grad():
for batch in dataloader:
batch = tuple(tensor.to(self.device) for tensor in batch)
(
batch_logits,
) = self.model(**dict(zip(encoded_input.keys(), batch)), return_dict=False)
batch_logits = batch_logits.cpu().numpy()[:, 1:, :] # Remove [CLS]
(batch_logits, ) = self.model(**dict(
zip(encoded_input.keys(), batch)),
return_dict=False)
batch_logits = batch_logits.cpu().numpy(
)[:, 1:, :] # Remove [CLS]
logits.append(batch_logits)

# Call model
Expand All @@ -195,7 +196,8 @@ def __call__(self,
return logits, index_map

@staticmethod
def _find_delim(*,
def _find_delim(
*,
input_text,
use_delim,
delim_set,
Expand All @@ -208,11 +210,15 @@ def _find_delim(*,
for sent_idx, input_sent in enumerate(input_text):
for word_idx, input_word in enumerate(input_sent):
if input_word in delim_set:
delim_index.add((sent_idx, word_idx,))
delim_index.add((
sent_idx,
word_idx,
))
return delim_index

@staticmethod
def _flatten_input_ids(*,
def _flatten_input_ids(
*,
input_ids_worded,
max_length,
delim_index,
Expand All @@ -239,11 +245,14 @@ def _flatten_input_ids(*,
# Insert tokens
index_map_sent.append((
len(input_ids), # line index
len(input_ids_sent), # token index
len(input_ids_sent), # token index
))
input_ids_sent += word_ids

if (sent_idx, word_idx,) in delim_index:
if (
sent_idx,
word_idx,
) in delim_index:
input_ids.append(input_ids_sent)
input_ids_sent = []

Expand All @@ -256,7 +265,9 @@ def _flatten_input_ids(*,

return input_ids, index_map

def _pad_input_ids(self, *,
def _pad_input_ids(
self,
*,
input_ids,
):
max_length = max(map(len, input_ids))
Expand All @@ -266,22 +277,24 @@ def _pad_input_ids(self, *,
for input_ids_sent in input_ids:
token_count = len(input_ids_sent)
pad_count = max_length - token_count
padded_input_ids.append(
[self.tokenizer.cls_token_id] +
input_ids_sent +
[self.tokenizer.sep_token_id] +
[self.tokenizer.pad_token_id] * pad_count
)
attention_mask.append(
[1] * (token_count+2) + # [CLS] & input & [SEP]
[0] * pad_count # [PAD]s
)
padded_input_ids.append([self.tokenizer.cls_token_id] +
input_ids_sent +
[self.tokenizer.sep_token_id] +
[self.tokenizer.pad_token_id] * pad_count)
attention_mask.append([1] *
(token_count + 2) + # [CLS] & input & [SEP]
[0] * pad_count # [PAD]s
)
return padded_input_ids, attention_mask


################################################################################################################################


class NerToken(NamedTuple):
"""A named-entity recognition token."""
word: str #: ``str``, the token word.
ner: str #: ``str``, the NER-tag.
idx: Tuple[int, int] #: ``Tuple[int, int]``, the starting / ending index in the sentence.
word: str #: ``str``, the token word.
ner: str #: ``str``, the NER-tag.
idx: Tuple[
int,
int] #: ``Tuple[int, int]``, the starting / ending index in the sentence.

0 comments on commit 3e1d007

Please sign in to comment.