Improve Ligurian tokenization #13694

Open · wants to merge 1 commit into master
spacy/lang/lij/__init__.py (2 additions & 1 deletion)
@@ -1,12 +1,13 @@
 from ...language import BaseDefaults, Language
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class LigurianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    prefixes = TOKENIZER_PREFIXES
     stop_words = STOP_WORDS

spacy/lang/lij/examples.py (1 addition & 1 deletion)
@@ -9,6 +9,6 @@
 sentences = [
     "Sciusciâ e sciorbî no se peu.",
     "Graçie di çetroin, che me son arrivæ.",
-    "Vegnime apreuvo, che ve fasso pescâ di òmmi.",
+    "Vegnîme apreuvo, che ve fasso pescâ di òmmi.",
     "Bella pe sempre l'ægua inta conchetta quande unn'agoggia d'ægua a se â trapaña.",
 ]

spacy/lang/lij/punctuation.py (16 additions & 4 deletions)
@@ -1,11 +1,23 @@
+from ..punctuation import (
+    TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES,
+    TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES,
+)
 from ..char_classes import ALPHA
-from ..punctuation import TOKENIZER_INFIXES

-ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
+ELISION = "'’"


-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
+_prefixes = [
+    r"['’‘][0-9]{2}",  # shorthand for years
+    r"[0-9]+°(?![cfkCFK])",  # use of degree symbol as ordinal indicator
+    r"[{el}‘]nn?[{el}]?".format(el=ELISION),  # elided forms of "un(na)"
+] + BASE_TOKENIZER_PREFIXES
+
+
+_infixes = BASE_TOKENIZER_INFIXES + [
+    r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION),
 ]

+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
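
For reference, here is a minimal sketch (not part of this PR) of what the new prefix and infix patterns match on raw strings; it assumes spaCy's ALPHA character class from spacy.lang.char_classes and simply re-uses the regexes above. The negative lookahead in the degree rule is what leaves measurements like "10°C" to the unit exceptions added in tokenizer_exceptions.py.

import re

from spacy.lang.char_classes import ALPHA

ELISION = "'’"

# Prefix rule: an apostrophe followed by two digits, i.e. shorthand years like '90.
year = re.compile(r"['’‘][0-9]{2}")
print(bool(year.match("’90")))      # True

# Prefix rule: degree sign as ordinal indicator, but not when a C/F/K unit follows.
ordinal = re.compile(r"[0-9]+°(?![cfkCFK])")
print(bool(ordinal.match("10°")))   # True
print(bool(ordinal.match("10°C")))  # False; "10°C" is covered by the °C exceptions and tests below

# Infix rule: a split point after letter + apostrophe when a letter, digit, or
# double quote follows, e.g. "l'ægua" -> "l'" + "ægua".
elision = re.compile(r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION))
print(elision.search("l'ægua") is not None)  # True
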
spacy/lang/lij/stop_words.py (16 additions & 14 deletions)
@@ -1,38 +1,40 @@
 STOP_WORDS = set(
     """
-a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei
+a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuo apreuvo ascì atra atre atri atro avanti avei aveiva

-bella belle belli bello ben
+bell' bell’ bella belle belli bello ben

-ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse
+ch' ch’ che chì chi ciù co-a co-e co-i co-o comm' comm’ comme con contr' contr’ contra cösa coscì cöse

-d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo
+d' d’ da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo drent' drent’ dento

-é e ê ea ean emmo en ëse
+é à e ê ea ean emmo en ëse

 fin fiña

-gh' ghe guæei
+gh' gh’ ghe guæi

-i î in insemme int' inta inte inti into
+i î in insemme int' int’ inta inte inti into

-l' lê lì
+l' l’ lê lì liatre liatri lô loiatre loiatri

-m' ma manco me megio meno mezo mi
+m' m’ ma mai manco me megio meno meza meze mezi mezo mi

-na n' ne ni ninte nisciun nisciuña no
+n' n’ na ne nì niatre niatri ninte nisciun nisciuña no noiatre noiatri

 o ò ô oua

 parte pe pe-a pe-i pe-e pe-o perché pittin pö primma pròpio

-quæ quand' quande quarche quella quelle quelli quello
+quæ quand' quand’ quande quarche quarcösa quell' quell’ quella quelle quelli quello

-s' sce scê sci sciâ sciô sciù se segge seu sò solo son sott' sta stæta stæte stæti stæto ste sti sto
+s' s’ sce scê scì scî scià sciâ sciô sciù se segge seu sò solo son sott' sott’ sotta sta stæta stæte stæti stæto ste sti sto

-tanta tante tanti tanto te ti torna tra tròppo tutta tutte tutti tutto
+tant' tant’ tanta tante tanti tanto te teu tò ti torna tra tròppo tutt' tutt’ tutta tutte tutti tutto

-un uña unn' unna
+un uña unn' unn’ unna

 voî voscià

 za zu
 """.split()
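
Most of the stop-word changes add curly-apostrophe counterparts next to the existing straight-apostrophe forms, plus a handful of new entries. A tiny check (not part of this PR, assuming this branch is installed):

from spacy.lang.lij.stop_words import STOP_WORDS

# Both apostrophe styles are now covered, and new entries are present.
print("ch'" in STOP_WORDS, "ch’" in STOP_WORDS)            # True True
print("apreuvo" in STOP_WORDS, "quarcösa" in STOP_WORDS)   # True True
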
spacy/lang/lij/tokenizer_exceptions.py (60 additions & 42 deletions)
@@ -1,49 +1,67 @@
-from ...symbols import ORTH
+from ...symbols import ORTH, NORM
 from ...util import update_exc
 from ..tokenizer_exceptions import BASE_EXCEPTIONS


+# Returns capitalized variants, all caps variants and with curly apostrophe
+def _variants(orth, exc):
+    yield orth, exc
+    yield orth.capitalize(), [
+        {ORTH: e[ORTH].capitalize() if i == 0 else e[ORTH], NORM: e.get(NORM, e[ORTH])}
+        for i, e in enumerate(exc)
+    ]
+    yield orth.upper(), [
+        {ORTH: e[ORTH].upper(), NORM: e.get(NORM, e[ORTH])} for e in exc
+    ]
+    if "'" in orth:
+        yield from _variants(
+            orth.replace("'", "’"),
+            [
+                {ORTH: e[ORTH].replace("'", "’"), NORM: e.get(NORM, e[ORTH])}
+                for e in exc
+            ],
+        )
+
+
 _exc = {}

-for raw in [
-    "a-e",
-    "a-o",
-    "a-i",
-    "a-a",
-    "co-a",
-    "co-e",
-    "co-i",
-    "co-o",
-    "da-a",
-    "da-e",
-    "da-i",
-    "da-o",
-    "pe-a",
-    "pe-e",
-    "pe-i",
-    "pe-o",
-]:
-    for orth in [raw, raw.capitalize()]:
-        _exc[orth] = [{ORTH: orth}]
-
-# Prefix + prepositions with à (e.g. "sott'a-o")
-for prep in [
-    "a-a",
-    "a-e",
-    "a-o",
-    "a-i",
-]:
-    for prefix in [
-        "sott'",
-        "sott’",
-        "contr'",
-        "contr’",
-        "ch'",
-        "ch’",
-        "s'",
-        "s’",
-    ]:
-        for prefix_orth in [prefix, prefix.capitalize()]:
-            _exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}]
+# Compound prepositions
+
+# Compounds with "inte" and "de" aren't split as they can be ambiguous
+# Format: (compound form, isolated form, determiners it goes with)
+_preps = [
+    ("a-", "à", "oaie"),
+    ("co-", "con", "oaie"),
+    ("da-", "da", "oaie"),
+    ("pe-", "pe", "oaie"),
+    ("pi-", "pe", "a"),  # colloquialism
+    ("de-", "de", "oaie"),  # incorrect, but occasionally seen
+    ("ne-", "inte", "oaie"),  # incorrect, but occasionally seen
+]
+for prep_, prep, dets in _preps:
+    for det in dets:
+        for orth, exc in _variants(
+            prep_ + det, [{ORTH: prep_, NORM: prep}, {ORTH: det}]
+        ):
+            _exc[orth] = exc
+
+# Units
+
+for u in "cfkCFK":
+    _exc[f"°{u}"] = [{ORTH: f"°{u}"}]
+    _exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
+
+# Other exceptions
+
+_other_exc = {
+    "'n'": [{ORTH: "'n'", NORM: "unna"}],
+    "‘n'": [{ORTH: "‘n'", NORM: "unna"}],
+    "'n": [{ORTH: "'n", NORM: "un"}],
+    "‘n": [{ORTH: "‘n", NORM: "un"}],
+    "tou": [{ORTH: "t", NORM: "te"}, {ORTH: "ou", NORM: "ô"}],
+}
+for orth_, exc_ in _other_exc.items():
+    for orth, exc in _variants(orth_, exc_):
+        _exc[orth] = exc

 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
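
A short usage sketch (not part of this PR, assuming a spaCy install that includes this branch); the expected texts and norms mirror the new tests added below:

import spacy

nlp = spacy.blank("lij")

# Compound prepositions are split, and the compound part carries its norm.
doc = nlp("co-i")
print([t.text for t in doc])   # ["co-", "i"]
print([t.norm_ for t in doc])  # ["con", "i"]

# Elided "un(na)" stays attached to its apostrophes and is split from what follows.
doc = nlp("'n'atra")
print([t.text for t in doc])   # ["'n'", "atra"]
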
spacy/tests/conftest.py (5 additions & 0 deletions)
@@ -282,6 +282,11 @@ def lg_tokenizer():
     return get_lang_class("lg")().tokenizer


+@pytest.fixture(scope="session")
+def lij_tokenizer():
+    return get_lang_class("lij")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def lt_tokenizer():
     return get_lang_class("lt")().tokenizer

Empty file.
spacy/tests/lang/lij/test_exceptions.py (13 additions & 0 deletions)
@@ -0,0 +1,13 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,expected_tokens,expected_norms",
+    [("a-e", ["a-", "e"], ["à", "e"]), ("co-i", ["co-", "i"], ["con", "i"])],
+)
+def test_prepositions(lij_tokenizer, text, expected_tokens, expected_norms):
+    """Test that compound prepositions are split correctly."""
+    tokens = lij_tokenizer(text)
+    assert len(tokens) == 2
+    assert [t.text for t in tokens] == expected_tokens
+    assert [t.norm_ for t in tokens] == expected_norms

spacy/tests/lang/lij/test_prefix_suffix_infix.py (24 additions & 0 deletions)
@@ -0,0 +1,24 @@
+import pytest
+
+
+@pytest.mark.parametrize("text", ["'90", "’90", "‘90"])
+def test_lij_tokenizer_handles_year_elision(lij_tokenizer, text):
+    """Test that elided years (e.g. '90 for 1990) are not split."""
+    tokens = lij_tokenizer(text)
+    assert len(tokens) == 1
+
+
+@pytest.mark.parametrize("text,expected_tokens", [("10°C", ["10", "°C"])])
+def test_lij_tokenizer_handles_degrees(lij_tokenizer, text, expected_tokens):
+    """Test that in degree units the degree symbol isn't split from the unit."""
+    tokens = lij_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens", [("'n'atra", ["'n'", "atra"])])
+def test_lij_tokenizer_handles_left_elision(lij_tokenizer, text, expected_tokens):
+    """Test that left-eliding expressions are not split from their left apostrophe."""
+    tokens = lij_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list