@@ -35,7 +35,7 @@ def assertTokenListEqual(self, actual_tokens, expected_tokens, msg=None):
           token_type=actual.token_type,
           first_token_after_newline=actual.first_token_after_newline,
       )
-      self.assertDataclassEqual(
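+      # Token is a dataclass, so assertEqual compares it field by field.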
+      self.assertEqual(
           expected,
           actual,
           msg=f"Token mismatch at index {i}",
@@ -117,6 +117,166 @@ def assertTokenListEqual(self, actual_tokens, expected_tokens, msg=None):
           input_text="",
           expected_tokens=[],
       ),
+      dict(
+          testcase_name="numbers_with_slash",
+          input_text="Patient BP was 120/80 mmHg.",
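+          # "120/80" is expected to split into NUMBER, PUNCTUATION, NUMBER.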
+          expected_tokens=[
+              tokenizer.Token(index=0, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(index=1, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(index=2, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(index=3, token_type=tokenizer.TokenType.NUMBER),
+              tokenizer.Token(
+                  index=4, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=5, token_type=tokenizer.TokenType.NUMBER),
+              tokenizer.Token(index=6, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=7, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+          ],
+      ),
+      dict(
+          testcase_name="decimals_and_alphanum_units",
+          input_text="Temp 98.6°F and dosage 50mg daily.",
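+          # "98.6" splits at the decimal point; "50mg" splits into NUMBER + WORD.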
+          expected_tokens=[
+              tokenizer.Token(
+                  index=0, token_type=tokenizer.TokenType.WORD
+              ),  # Temp
+              tokenizer.Token(
+                  index=1, token_type=tokenizer.TokenType.NUMBER
+              ),  # 98
+              tokenizer.Token(
+                  index=2, token_type=tokenizer.TokenType.PUNCTUATION
+              ),  # .
+              tokenizer.Token(
+                  index=3, token_type=tokenizer.TokenType.NUMBER
+              ),  # 6
+              tokenizer.Token(
+                  index=4, token_type=tokenizer.TokenType.PUNCTUATION
+              ),  # °
+              tokenizer.Token(
+                  index=5, token_type=tokenizer.TokenType.WORD
+              ),  # F
+              tokenizer.Token(
+                  index=6, token_type=tokenizer.TokenType.WORD
+              ),  # and
+              tokenizer.Token(
+                  index=7, token_type=tokenizer.TokenType.WORD
+              ),  # dosage
+              tokenizer.Token(
+                  index=8, token_type=tokenizer.TokenType.NUMBER
+              ),  # 50
+              tokenizer.Token(
+                  index=9, token_type=tokenizer.TokenType.WORD
+              ),  # mg
+              tokenizer.Token(
+                  index=10, token_type=tokenizer.TokenType.WORD
+              ),  # daily
+              tokenizer.Token(
+                  index=11, token_type=tokenizer.TokenType.PUNCTUATION
+              ),  # .
+          ],
+      ),
+      dict(
+          testcase_name="japanese_text",
+          input_text="これはテストです。",
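+          # Translation: "This is a test." The CJK run stays one WORD token.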
+          expected_tokens=[
+              tokenizer.Token(index=0, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=1, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+          ],
+      ),
+      dict(
+          testcase_name="cjk_slash_abbreviation",
+          input_text="患者の血圧は120/80です。",
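+          # Translation: "The patient's blood pressure is 120/80."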
+          expected_tokens=[
+              tokenizer.Token(index=0, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(index=1, token_type=tokenizer.TokenType.NUMBER),
+              tokenizer.Token(
+                  index=2, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=3, token_type=tokenizer.TokenType.NUMBER),
+              tokenizer.Token(index=4, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=5, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+          ],
+      ),
+      dict(
+          testcase_name="devanagari_hindi_split",
+          input_text="नमस्ते दुनिया, मेरा स्कोर १००/१०० है।",
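+          # Translation: "Hello world, my score is 100/100" (Devanagari digits).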
+          expected_tokens=[
+              tokenizer.Token(index=0, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=1, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=2, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=3, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=4, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=5, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=6, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=7, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=8, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=9, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=10, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=11, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=12, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=13, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=14, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=15, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=16, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=17, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=18, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(index=19, token_type=tokenizer.TokenType.NUMBER),
+              tokenizer.Token(
+                  index=20, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=21, token_type=tokenizer.TokenType.NUMBER),
+              tokenizer.Token(index=22, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=23, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+          ],
+      ),
+      dict(
+          testcase_name="arabic_text",
+          input_text="مرحبا بالعالم! درجة الحرارة ٢٥.٥ درجة.",
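+          # Translation: "Hello world! The temperature is 25.5 degrees."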
+          expected_tokens=[
+              tokenizer.Token(index=0, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(index=1, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=2, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=3, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(index=4, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(index=5, token_type=tokenizer.TokenType.NUMBER),
+              tokenizer.Token(
+                  index=6, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+              tokenizer.Token(index=7, token_type=tokenizer.TokenType.NUMBER),
+              tokenizer.Token(index=8, token_type=tokenizer.TokenType.WORD),
+              tokenizer.Token(
+                  index=9, token_type=tokenizer.TokenType.PUNCTUATION
+              ),
+          ],
+      ),
   )
   def test_tokenize_various_inputs(self, input_text, expected_tokens):
     tokenized = tokenizer.tokenize(input_text)
@@ -131,32 +291,20 @@ def test_first_token_after_newline_flag(self):
     tokenized = tokenizer.tokenize(input_text)
 
     expected_tokens = [
-        tokenizer.Token(
-            index=0,
-            token_type=tokenizer.TokenType.WORD,
-        ),
-        tokenizer.Token(
-            index=1,
-            token_type=tokenizer.TokenType.NUMBER,
-        ),
+        tokenizer.Token(index=0, token_type=tokenizer.TokenType.WORD),
+        tokenizer.Token(index=1, token_type=tokenizer.TokenType.NUMBER),
         tokenizer.Token(
             index=2,
             token_type=tokenizer.TokenType.WORD,
             first_token_after_newline=True,
         ),
-        tokenizer.Token(
-            index=3,
-            token_type=tokenizer.TokenType.NUMBER,
-        ),
+        tokenizer.Token(index=3, token_type=tokenizer.TokenType.NUMBER),
         tokenizer.Token(
             index=4,
             token_type=tokenizer.TokenType.WORD,
             first_token_after_newline=True,
         ),
-        tokenizer.Token(
-            index=5,
-            token_type=tokenizer.TokenType.NUMBER,
-        ),
+        tokenizer.Token(index=5, token_type=tokenizer.TokenType.NUMBER),
     ]
 
     self.assertTokenListEqual(
@@ -257,7 +405,7 @@ class SentenceRangeTest(parameterized.TestCase):
           Blood pressure was 160/90 and patient was recommended to
           Atenolol 50 mg daily."""),
       start_pos=0,
-      expected_interval=(0, 9),
+      expected_interval=(0, 11),
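+      # "160/90" now yields NUMBER, PUNCTUATION, NUMBER, so the partial
+      # sentence covers 11 tokens instead of 9.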
       ),
   )
   def test_partial_sentence_range(