Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@ jobs:

tests:
name: Test
needs: Validate
needs: validate
strategy:
fail-fast: true
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python_version: ["3.9", "3.12", "3.13"]
Expand Down
2 changes: 1 addition & 1 deletion spacy/cli/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def parse_config_overrides(
RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
"""
env_string = os.environ.get(env_var, "") if env_var else ""
env_overrides = _parse_overrides(split_arg_string(env_string))
env_overrides = _parse_overrides(split_arg_string(env_string)) # type: ignore[operator]
cli_overrides = _parse_overrides(args, is_cli=True)
if cli_overrides:
keys = [k for k in cli_overrides if k not in env_overrides]
Expand Down
7 changes: 5 additions & 2 deletions spacy/lang/ht/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
from ...language import BaseDefaults, Language
from .lemmatizer import HaitianCreoleLemmatizer
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


class HaitianCreoleDefaults(BaseDefaults):
Expand All @@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
stop_words = STOP_WORDS
tag_map = TAG_MAP


class HaitianCreole(Language):
lang = "ht"
Defaults = HaitianCreoleDefaults


@HaitianCreole.factory(
"lemmatizer",
assigns=["token.lemma"],
Expand All @@ -49,4 +51,5 @@ def make_lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)


__all__ = ["HaitianCreole"]
2 changes: 1 addition & 1 deletion spacy/lang/ht/lemmatizer.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from typing import List, Tuple

from ...lookups import Lookups
from ...pipeline import Lemmatizer
from ...tokens import Token
from ...lookups import Lookups


class HaitianCreoleLemmatizer(Lemmatizer):
Expand Down
3 changes: 3 additions & 0 deletions spacy/lang/ht/lex_attrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
"P": "Pa",
}


def like_num(text):
text = text.strip().lower()
if text.startswith(("+", "-", "±", "~")):
Expand All @@ -69,9 +70,11 @@ def like_num(text):
return True
return False


def norm_custom(text):
    """Return the normalized form of *text*.

    Looks the token up in NORM_MAP; when absent, falls back to its
    lowercase form.
    """
    if text in NORM_MAP:
        return NORM_MAP[text]
    return text.lower()


LEX_ATTRS = {
LIKE_NUM: like_num,
NORM: norm_custom,
Expand Down
65 changes: 40 additions & 25 deletions spacy/lang/ht/punctuation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
ALPHA_UPPER,
CONCAT_QUOTES,
HYPHENS,
LIST_PUNCT,
LIST_QUOTES,
LIST_ELLIPSES,
LIST_ICONS,
LIST_PUNCT,
LIST_QUOTES,
merge_chars,
)

Expand All @@ -16,28 +16,43 @@
_prefixes_elision = "m n l y t k w"
_prefixes_elision += " " + _prefixes_elision.upper()

TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
r"(?:({pe})[{el}])(?=[{a}])".format(
a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
)
]
# Prefix patterns: standard punctuation and quote characters, plus a pattern
# splitting an elided prefix (one of the letters in _prefixes_elision, e.g.
# m/n/l/y/t/k/w, followed by an elision character) off a following
# alphabetic character.
TOKENIZER_PREFIXES = (
    LIST_PUNCT
    + LIST_QUOTES
    + [
        r"(?:({pe})[{el}])(?=[{a}])".format(
            a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
        )
    ]
)

TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
r"(?<=[0-9])%", # numbers like 10%
r"(?<=[0-9])(?:{h})".format(h=HYPHENS), # hyphens after numbers
r"(?<=[{a}])['’]".format(a=ALPHA), # apostrophes after letters
r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA), # contractions
r"(?<=[{a}0-9])\)", # right parenthesis after letter/number
r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA), # period after letter if space or end of string
r"(?<=\))[\.\?!]", # punctuation immediately after right parenthesis
]
# Suffix patterns: shared punctuation/quote/ellipsis lists plus rules for
# percent signs, hyphens after digits, apostrophe contractions, closing
# parentheses, and sentence-final periods.
TOKENIZER_SUFFIXES = (
    LIST_PUNCT
    + LIST_QUOTES
    + LIST_ELLIPSES
    + [
        r"(?<=[0-9])%",  # numbers like 10%
        r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
        r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
        r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
        r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
        r"(?<=[{a}])\.(?=\s|$)".format(
            a=ALPHA
        ),  # period after letter if space or end of string
        r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
    ]
)

TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
]
# Infix patterns: ellipses and icon lists plus rules that split tokens
# internally — arithmetic operators between digits, a period between a
# lowercase/quote and an uppercase/quote character, commas and hyphens
# between letters, and a letter+elision character before another letter.
TOKENIZER_INFIXES = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
    ]
)
3 changes: 1 addition & 2 deletions spacy/lang/ht/stop_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,7 @@

men mèsi oswa osinon

"""
.split()
""".split()
)

# Add common contractions, with and without apostrophe variants
Expand Down
20 changes: 19 additions & 1 deletion spacy/lang/ht/tag_map.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,22 @@
from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
from spacy.symbols import (
ADJ,
ADP,
ADV,
AUX,
CCONJ,
DET,
INTJ,
NOUN,
NUM,
PART,
PRON,
PROPN,
PUNCT,
SCONJ,
SYM,
VERB,
X,
)

TAG_MAP = {
"NOUN": {"pos": NOUN},
Expand Down
Loading
Loading