Skip to content

Commit 972f8a6

Browse files
committed
1 parent 41e0777 commit 972f8a6

File tree

14 files changed

+195
-138
lines changed

14 files changed

+195
-138
lines changed

.github/workflows/tests.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,9 @@ jobs:
5454
5555
tests:
5656
name: Test
57-
needs: Validate
57+
needs: validate
5858
strategy:
59-
fail-fast: true
59+
fail-fast: false
6060
matrix:
6161
os: [ubuntu-latest, windows-latest, macos-latest]
6262
python_version: ["3.9", "3.12", "3.13"]

spacy/cli/_util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def parse_config_overrides(
9999
RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
100100
"""
101101
env_string = os.environ.get(env_var, "") if env_var else ""
102-
env_overrides = _parse_overrides(split_arg_string(env_string))
102+
env_overrides = _parse_overrides(split_arg_string(env_string)) # type: ignore[operator]
103103
cli_overrides = _parse_overrides(args, is_cli=True)
104104
if cli_overrides:
105105
keys = [k for k in cli_overrides if k not in env_overrides]

spacy/lang/ht/__init__.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55
from ...language import BaseDefaults, Language
66
from .lemmatizer import HaitianCreoleLemmatizer
77
from .lex_attrs import LEX_ATTRS
8-
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
8+
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
99
from .stop_words import STOP_WORDS
1010
from .syntax_iterators import SYNTAX_ITERATORS
11-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
1211
from .tag_map import TAG_MAP
12+
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
1313

1414

1515
class HaitianCreoleDefaults(BaseDefaults):
@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
2222
stop_words = STOP_WORDS
2323
tag_map = TAG_MAP
2424

25+
2526
class HaitianCreole(Language):
2627
lang = "ht"
2728
Defaults = HaitianCreoleDefaults
2829

30+
2931
@HaitianCreole.factory(
3032
"lemmatizer",
3133
assigns=["token.lemma"],
@@ -49,4 +51,5 @@ def make_lemmatizer(
4951
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
5052
)
5153

54+
5255
__all__ = ["HaitianCreole"]

spacy/lang/ht/lemmatizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
from typing import List, Tuple
22

3+
from ...lookups import Lookups
34
from ...pipeline import Lemmatizer
45
from ...tokens import Token
5-
from ...lookups import Lookups
66

77

88
class HaitianCreoleLemmatizer(Lemmatizer):

spacy/lang/ht/lex_attrs.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
"P": "Pa",
5050
}
5151

52+
5253
def like_num(text):
5354
text = text.strip().lower()
5455
if text.startswith(("+", "-", "±", "~")):
@@ -69,9 +70,11 @@ def like_num(text):
6970
return True
7071
return False
7172

73+
7274
def norm_custom(text):
7375
return NORM_MAP.get(text, text.lower())
7476

77+
7578
LEX_ATTRS = {
7679
LIKE_NUM: like_num,
7780
NORM: norm_custom,

spacy/lang/ht/punctuation.py

Lines changed: 40 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
ALPHA_UPPER,
55
CONCAT_QUOTES,
66
HYPHENS,
7-
LIST_PUNCT,
8-
LIST_QUOTES,
97
LIST_ELLIPSES,
108
LIST_ICONS,
9+
LIST_PUNCT,
10+
LIST_QUOTES,
1111
merge_chars,
1212
)
1313

@@ -16,28 +16,43 @@
1616
_prefixes_elision = "m n l y t k w"
1717
_prefixes_elision += " " + _prefixes_elision.upper()
1818

19-
TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
20-
r"(?:({pe})[{el}])(?=[{a}])".format(
21-
a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
22-
)
23-
]
19+
TOKENIZER_PREFIXES = (
20+
LIST_PUNCT
21+
+ LIST_QUOTES
22+
+ [
23+
r"(?:({pe})[{el}])(?=[{a}])".format(
24+
a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
25+
)
26+
]
27+
)
2428

25-
TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
26-
r"(?<=[0-9])%", # numbers like 10%
27-
r"(?<=[0-9])(?:{h})".format(h=HYPHENS), # hyphens after numbers
28-
r"(?<=[{a}])['’]".format(a=ALPHA), # apostrophes after letters
29-
r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA), # contractions
30-
r"(?<=[{a}0-9])\)", # right parenthesis after letter/number
31-
r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA), # period after letter if space or end of string
32-
r"(?<=\))[\.\?!]", # punctuation immediately after right parenthesis
33-
]
29+
TOKENIZER_SUFFIXES = (
30+
LIST_PUNCT
31+
+ LIST_QUOTES
32+
+ LIST_ELLIPSES
33+
+ [
34+
r"(?<=[0-9])%", # numbers like 10%
35+
r"(?<=[0-9])(?:{h})".format(h=HYPHENS), # hyphens after numbers
36+
r"(?<=[{a}])['’]".format(a=ALPHA), # apostrophes after letters
37+
r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA), # contractions
38+
r"(?<=[{a}0-9])\)", # right parenthesis after letter/number
39+
r"(?<=[{a}])\.(?=\s|$)".format(
40+
a=ALPHA
41+
), # period after letter if space or end of string
42+
r"(?<=\))[\.\?!]", # punctuation immediately after right parenthesis
43+
]
44+
)
3445

35-
TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
36-
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
37-
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
38-
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
39-
),
40-
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
41-
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
42-
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
43-
]
46+
TOKENIZER_INFIXES = (
47+
LIST_ELLIPSES
48+
+ LIST_ICONS
49+
+ [
50+
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
51+
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
52+
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
53+
),
54+
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
55+
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
56+
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
57+
]
58+
)

spacy/lang/ht/stop_words.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,7 @@
3939
4040
men mèsi oswa osinon
4141
42-
"""
43-
.split()
42+
""".split()
4443
)
4544

4645
# Add common contractions, with and without apostrophe variants

spacy/lang/ht/tag_map.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,22 @@
1-
from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
1+
from spacy.symbols import (
2+
ADJ,
3+
ADP,
4+
ADV,
5+
AUX,
6+
CCONJ,
7+
DET,
8+
INTJ,
9+
NOUN,
10+
NUM,
11+
PART,
12+
PRON,
13+
PROPN,
14+
PUNCT,
15+
SCONJ,
16+
SYM,
17+
VERB,
18+
X,
19+
)
220

321
TAG_MAP = {
422
"NOUN": {"pos": NOUN},

0 commit comments

Comments
 (0)