Commit dd63ea5

Added multi-language support to the tokenizer
Signed-off-by: shankeleven <[email protected]>
1 parent c8ef723 commit dd63ea5

5 files changed: +177 -28 lines

langextract/chunking.py

Lines changed: 1 addition & 1 deletion
@@ -22,10 +22,10 @@
 
 from collections.abc import Iterable, Iterator, Sequence
 import dataclasses
-import re
 
 from absl import logging
 import more_itertools
+import regex as re
 
 from langextract import data
 from langextract import exceptions
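
chunking.py only swaps its regular-expression import. The third-party regex package deliberately mirrors the standard library's re API (compile, match, finditer, split, and so on), so aliasing it as re leaves every call site in the module unchanged while adding Unicode-aware features. A minimal illustration of the drop-in behaviour (not code from this repository):

    import regex as re  # third-party "regex", aliased so existing call sites keep working

    # Same compile()/finditer() surface as the stdlib module, plus \p{...} support.
    pattern = re.compile(r"\p{L}+")
    print([m.group(0) for m in pattern.finditer("Grüße, 世界!")])
    # ['Grüße', '世界']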

langextract/tokenizer.py

Lines changed: 5 additions & 5 deletions
@@ -26,9 +26,9 @@
 from collections.abc import Sequence, Set
 import dataclasses
 import enum
-import re
 
 from absl import logging
+import regex as re
 
 from langextract import exceptions
 

@@ -133,11 +133,11 @@ class TokenizedText:
 
 
 # Regex patterns for tokenization.
-_LETTERS_PATTERN = r"[A-Za-z]+"
-_DIGITS_PATTERN = r"[0-9]+"
-_SYMBOLS_PATTERN = r"[^A-Za-z0-9\s]+"
+_LETTERS_PATTERN = r"\p{L}+"
+_DIGITS_PATTERN = r"\p{N}+"
+_SYMBOLS_PATTERN = r"[^\p{L}\p{N}\s]+"
 _END_OF_SENTENCE_PATTERN = re.compile(r"[.?!]$")
-_SLASH_ABBREV_PATTERN = r"[A-Za-z0-9]+(?:/[A-Za-z0-9]+)+"
+_SLASH_ABBREV_PATTERN = r"(?:{_LETTERS_PATTERN}|{_DIGITS_PATTERN})(?:/(?:{_LETTERS_PATTERN}|{_DIGITS_PATTERN}))+"
 
 _TOKEN_PATTERN = re.compile(
     rf"{_SLASH_ABBREV_PATTERN}|{_LETTERS_PATTERN}|{_DIGITS_PATTERN}|{_SYMBOLS_PATTERN}"

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ dependencies = [
     "pydantic>=1.8.0",
     "python-dotenv>=0.19.0",
     "PyYAML>=6.0",
+    "regex>=2022.1.18",
     "requests>=2.25.0",
     "tqdm>=4.64.0",
     "typing-extensions>=4.0.0"

tests/chunking_test.py

Lines changed: 4 additions & 4 deletions
@@ -307,25 +307,25 @@ class BatchingTest(parameterized.TestCase):
         ),
         chunking.TextChunk(
             token_interval=tokenizer.TokenInterval(
-                start_index=7, end_index=8
+                start_index=7, end_index=10
             ),
             document=_SAMPLE_DOCUMENT,
         ),
         chunking.TextChunk(
             token_interval=tokenizer.TokenInterval(
-                start_index=8, end_index=12
+                start_index=10, end_index=14
             ),
             document=_SAMPLE_DOCUMENT,
         ),
         chunking.TextChunk(
             token_interval=tokenizer.TokenInterval(
-                start_index=12, end_index=17
+                start_index=14, end_index=19
             ),
             document=_SAMPLE_DOCUMENT,
         ),
         chunking.TextChunk(
             token_interval=tokenizer.TokenInterval(
-                start_index=17, end_index=20
+                start_index=19, end_index=22
             ),
             document=_SAMPLE_DOCUMENT,
         ),
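
Every updated end_index above grows by two, which matches a single slash reading in _SAMPLE_DOCUMENT (presumably a blood-pressure value like 160/90, as in the sentence-range test further down) now tokenizing as three tokens instead of one. A rough before/after comparison under that assumption, using the old and new character classes directly rather than the chunking module itself:

    import regex

    # Old ASCII patterns kept slash expressions as a single token.
    OLD_TOKEN_RE = regex.compile(
        r"[A-Za-z0-9]+(?:/[A-Za-z0-9]+)+|[A-Za-z]+|[0-9]+|[^A-Za-z0-9\s]+"
    )
    # Simplified stand-in for the new Unicode-aware tokenization.
    NEW_TOKEN_RE = regex.compile(r"\p{L}+|\p{N}+|[^\p{L}\p{N}\s]+")

    print(OLD_TOKEN_RE.findall("160/90"))  # ['160/90']        -> one token
    print(NEW_TOKEN_RE.findall("160/90"))  # ['160', '/', '90'] -> three tokens

That two-token growth per reading is also why the sentence interval in tokenizer_test below moves from (0, 9) to (0, 11).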

tests/tokenizer_test.py

Lines changed: 166 additions & 18 deletions
@@ -35,7 +35,7 @@ def assertTokenListEqual(self, actual_tokens, expected_tokens, msg=None):
           token_type=actual.token_type,
           first_token_after_newline=actual.first_token_after_newline,
       )
-      self.assertDataclassEqual(
+      self.assertEqual(
           expected,
           actual,
           msg=f"Token mismatch at index {i}",

@@ -117,6 +117,166 @@ def assertTokenListEqual(self, actual_tokens, expected_tokens, msg=None):
           input_text="",
           expected_tokens=[],
       ),
+      dict(
+          testcase_name="numbers_with_slash",
+          input_text="Patient BP was 120/80 mmHg.",
+          expected_tokens=[
+              tokenizer.Token(index=0, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(index=1, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(index=2, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(index=3, token_type=tokenizer.TokenType.NUMBER),
+              tokenizer.Token(
+                  index=4, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=5, token_type=tokenizer.TokenType.NUMBER),
+              tokenizer.Token(index=6, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=7, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+          ],
+      ),
+      dict(
+          testcase_name="decimals_and_alphanum_units",
+          input_text="Temp 98.6°F and dosage 50mg daily.",
+          expected_tokens=[
+              tokenizer.Token(
+                  index=0, token_type=tokenizer.TokenType.WORD
+              ),  # Temp
+              tokenizer.Token(
+                  index=1, token_type=tokenizer.TokenType.NUMBER
+              ),  # 98
+              tokenizer.Token(
+                  index=2, token_type=tokenizer.TokenType.PUNCTUATION
+              ),  # .
+              tokenizer.Token(
+                  index=3, token_type=tokenizer.TokenType.NUMBER
+              ),  # 6
+              tokenizer.Token(
+                  index=4, token_type=tokenizer.TokenType.PUNCTUATION
+              ),  # °
+              tokenizer.Token(
+                  index=5, token_type=tokenizer.TokenType.WORD
+              ),  # F
+              tokenizer.Token(
+                  index=6, token_type=tokenizer.TokenType.WORD
+              ),  # and
+              tokenizer.Token(
+                  index=7, token_type=tokenizer.TokenType.WORD
+              ),  # dosage
+              tokenizer.Token(
+                  index=8, token_type=tokenizer.TokenType.NUMBER
+              ),  # 50
+              tokenizer.Token(
+                  index=9, token_type=tokenizer.TokenType.WORD
+              ),  # mg
+              tokenizer.Token(
+                  index=10, token_type=tokenizer.TokenType.WORD
+              ),  # daily
+              tokenizer.Token(
+                  index=11, token_type=tokenizer.TokenType.PUNCTUATION
+              ),  # .
+          ],
+      ),
+      dict(
+          testcase_name="japanese_text",
+          input_text="これはテストです。",
+          expected_tokens=[
+              tokenizer.Token(index=0, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=1, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+          ],
+      ),
+      dict(
+          testcase_name="cjk_slash_abbreviation",
+          input_text="患者の血圧は120/80です。",
+          expected_tokens=[
+              tokenizer.Token(index=0, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(index=1, token_type=tokenizer.TokenType.NUMBER),
+              tokenizer.Token(
+                  index=2, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=3, token_type=tokenizer.TokenType.NUMBER),
+              tokenizer.Token(index=4, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=5, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+          ],
+      ),
+      dict(
+          testcase_name="devanagari_hindi_split",
+          input_text="नमस्ते दुनिया, मेरा स्कोर १००/१०० है।",
+          expected_tokens=[
+              tokenizer.Token(index=0, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=1, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=2, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=3, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=4, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=5, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=6, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=7, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=8, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=9, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=10, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=11, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=12, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=13, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=14, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=15, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=16, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=17, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=18, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(index=19, token_type=tokenizer.TokenType.NUMBER),
+              tokenizer.Token(
+                  index=20, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=21, token_type=tokenizer.TokenType.NUMBER),
+              tokenizer.Token(index=22, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=23, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+          ],
+      ),
+      dict(
+          testcase_name="arabic_text",
+          input_text="مرحبا بالعالم! درجة الحرارة ٢٥.٥ درجة.",
+          expected_tokens=[
+              tokenizer.Token(index=0, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(index=1, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=2, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=3, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(index=4, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(index=5, token_type=tokenizer.TokenType.NUMBER),
+              tokenizer.Token(
+                  index=6, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=7, token_type=tokenizer.TokenType.NUMBER),
+              tokenizer.Token(index=8, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=9, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+          ],
+      ),
   )
   def test_tokenize_various_inputs(self, input_text, expected_tokens):
     tokenized = tokenizer.tokenize(input_text)

@@ -131,32 +291,20 @@ def test_first_token_after_newline_flag(self):
     tokenized = tokenizer.tokenize(input_text)
 
     expected_tokens = [
-        tokenizer.Token(
-            index=0,
-            token_type=tokenizer.TokenType.WORD,
-        ),
-        tokenizer.Token(
-            index=1,
-            token_type=tokenizer.TokenType.NUMBER,
-        ),
+        tokenizer.Token(index=0, token_type=tokenizer.TokenType.WORD),
+        tokenizer.Token(index=1, token_type=tokenizer.TokenType.NUMBER),
         tokenizer.Token(
             index=2,
             token_type=tokenizer.TokenType.WORD,
            first_token_after_newline=True,
         ),
-        tokenizer.Token(
-            index=3,
-            token_type=tokenizer.TokenType.NUMBER,
-        ),
+        tokenizer.Token(index=3, token_type=tokenizer.TokenType.NUMBER),
         tokenizer.Token(
             index=4,
             token_type=tokenizer.TokenType.WORD,
             first_token_after_newline=True,
         ),
-        tokenizer.Token(
-            index=5,
-            token_type=tokenizer.TokenType.NUMBER,
-        ),
+        tokenizer.Token(index=5, token_type=tokenizer.TokenType.NUMBER),
     ]
 
     self.assertTokenListEqual(

@@ -257,7 +405,7 @@ class SentenceRangeTest(parameterized.TestCase):
             Blood pressure was 160/90 and patient was recommended to
             Atenolol 50 mg daily."""),
           start_pos=0,
-          expected_interval=(0, 9),
+          expected_interval=(0, 11),
       ),
   )
   def test_partial_sentence_range(
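
The devanagari_hindi_split expectations look surprising at first: each Hindi word comes out as several alternating WORD and PUNCTUATION tokens. That follows from \p{L}, which covers base letters but not combining marks (Unicode category M), so a vowel sign or virama interrupts the letter run and falls through to the symbols pattern. A small sketch of the effect, again with a simplified stand-in pattern:

    import regex
    import unicodedata

    token_re = regex.compile(r"\p{L}+|\p{N}+|[^\p{L}\p{N}\s]+")  # simplified stand-in

    print(token_re.findall("नमस्ते"))
    # ['नमस', '्', 'त', 'े']  -- the virama and vowel sign split the word
    print([unicodedata.category(ch) for ch in "नमस्ते"])
    # ['Lo', 'Lo', 'Lo', 'Mn', 'Lo', 'Mn']  -- marks (Mn) are not \p{L}

By contrast, これはテストです stays a single WORD token, since Hiragana and Katakana are ordinary letters under \p{L}.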
