|
# Single-letter elision prefixes, space-separated: the lower-case letters
# first, followed by the same letters upper-cased.
_prefixes_elision = "m n l y t k w"
_prefixes_elision = " ".join([_prefixes_elision, _prefixes_elision.upper()])
|
18 | 18 |
|
19 |
# Prefix rules: the shared punctuation and quote lists come first ...
TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES
# ... followed by one elision rule: a captured prefix (built by
# merge_chars from _prefixes_elision) plus a single character from the
# ELISION class, matched only when a character from the ALPHA class follows.
TOKENIZER_PREFIXES = TOKENIZER_PREFIXES + [
    r"(?:({pe})[{el}])(?=[{a}])".format(
        pe=merge_chars(_prefixes_elision),
        el=ELISION,
        a=ALPHA,
    )
]
24 | 28 |
|
25 |
# Suffix rules: the shared punctuation, quote and ellipsis lists, followed by
# the patterns specific to this module. Every template containing a "{...}"
# placeholder must be passed through .format() so the placeholder is replaced
# by the corresponding character class.
TOKENIZER_SUFFIXES = (
    LIST_PUNCT
    + LIST_QUOTES
    + LIST_ELLIPSES
    + [
        r"(?<=[0-9])%",  # numbers like 10%
        r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
        r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
        r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
        # Right parenthesis after letter/number. Without .format() the "{a}"
        # stays literal and the lookbehind class only matches "{", "a", "}"
        # and digits instead of the ALPHA characters.
        r"(?<=[{a}0-9])\)".format(a=ALPHA),
        # Period after letter if followed by whitespace or end of string.
        r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),
        r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
    ]
)
34 | 45 |
|
35 |
# Infix rules: the shared ellipsis and icon lists come first ...
TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS
# ... followed by the patterns specific to this module.
TOKENIZER_INFIXES = TOKENIZER_INFIXES + [
    # One of + - * ^ preceded by a digit and followed by a digit or hyphen.
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    # Period preceded by an ALPHA_LOWER/CONCAT_QUOTES character and followed
    # by an ALPHA_UPPER/CONCAT_QUOTES character.
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        q=CONCAT_QUOTES, au=ALPHA_UPPER, al=ALPHA_LOWER
    ),
    # Comma with an ALPHA character on both sides.
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    # HYPHENS alternative preceded by an ALPHA character or digit and
    # followed by an ALPHA character.
    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(h=HYPHENS, a=ALPHA),
    # Zero-width position right after an ALPHA character plus an ELISION
    # character, when another ALPHA character follows.
    r"(?<=[{a}][{el}])(?=[{a}])".format(el=ELISION, a=ALPHA),
]
0 commit comments