-
Notifications
You must be signed in to change notification settings - Fork 198
/
Copy pathremove_non_chinese_character_mapper.py
47 lines (39 loc) · 1.56 KB
/
remove_non_chinese_character_mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import regex as re
from ..base_op import OPERATORS, Mapper
@OPERATORS.register_module('remove_non_chinese_character_mapper')
class RemoveNonChineseCharacterlMapper(Mapper):
    """Mapper to remove non-Chinese characters from text samples.

    A negated character class is assembled once at construction time.
    Every run of characters that falls outside the kept set is replaced
    with the empty string when a batch is processed.
    """

    _batched_op = True

    def __init__(self,
                 keep_alphabet: bool = True,
                 keep_number: bool = True,
                 keep_punc: bool = True,
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param keep_alphabet: whether to keep ASCII letters (``A-Za-z``)
        :param keep_number: whether to keep ASCII digits (``0-9``)
        :param keep_punc: whether to keep the punctuation characters
            listed in the pattern below
        :param args: extra positional args forwarded to the base class
        :param kwargs: extra keyword args forwarded to the base class
        """
        super().__init__(*args, **kwargs)
        # Start a negated character class: everything that is NOT in the
        # range U+4E00..U+9FA5 (plus the optional groups appended below)
        # is matched and therefore removed.
        self.pattern = u'[^\u4e00-\u9fa5'
        if keep_alphabet:
            self.pattern += u'A-Za-z'
        if keep_number:
            self.pattern += u'0-9'
        if keep_punc:
            # Appends the kept punctuation and closes the class; the
            # trailing ``+`` lets a whole run be matched at once.
            self.pattern += u'., ,\\-。%《*》/•、&&(—)(+):?!!“”·]+'
        else:
            self.pattern += u']'

    def process_batched(self, samples):
        """Strip every character outside the kept set from each text.

        :param samples: batch container indexed by ``self.text_key``;
            the indexed value is iterated and updated in place by
            position (``self.text_key`` is not set in this class --
            presumably provided by the base class, verify there)
        :return: the same ``samples`` object after in-place cleaning
        """
        for idx, text in enumerate(samples[self.text_key]):
            # ``subn`` reports how many replacements were made, so a
            # single pass both detects and removes unwanted characters
            # (previously a ``search`` was followed by a second ``sub``
            # scan over the same text).
            cleaned, num_removed = re.subn(pattern=self.pattern,
                                           repl=r'',
                                           string=text,
                                           flags=re.DOTALL)
            # Write back only when something was actually removed, so
            # untouched samples are never reassigned.
            if num_removed:
                samples[self.text_key][idx] = cleaned
        return samples