Skip to content

Commit

Permalink
o sin e sona Linku
Browse files (browse the repository at this point in the history)
ni li wile e ante lili poka mute

- nimi sin pi mute lili o lon poki pi toki ante
- nimi pi mute lili o weka tan ilo CorpusConfig tan ni: ona li lon toki
ante
  • Loading branch information
gregdan3 committed Jul 26, 2024
1 parent 4aff04f commit 481cdfb
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 12 deletions.
23 changes: 22 additions & 1 deletion src/sonatoki/Configs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# STL
from copy import deepcopy
from typing import List, Type, TypedDict
from typing import Set, List, Type, TypedDict, cast

# PDM
from typing_extensions import NotRequired
Expand All @@ -18,6 +18,7 @@
NimiKuLili,
NimiKuSuli,
ProperName,
Phonotactic,
Punctuation,
LongSyllabic,
Miscellaneous,
Expand Down Expand Up @@ -102,6 +103,26 @@ class IloConfig(TypedDict):
"scorer": SoftScaling,
"passing_score": 0.8,
}

# TODO: create a mechanism to omit tokens from a filter with more granularity
# Interim workaround: take the `tokens` attribute of the first scoring filter
# in CorpusConfig and drop a handful of short words from it.
# `cast` is a type-checker hint only; at runtime it returns its argument
# unchanged, so this name refers to the filter's own `tokens` object rather
# than a copy.
__corpus_tokens_dict: Set[str] = cast(
    Set[str],
    CorpusConfig["scoring_filters"][
        0
    ].tokens,  # pyright: ignore[reportAttributeAccessIssue]
)
# NOTE(review): assuming `.tokens` really is a mutable `set` (as the cast
# asserts), `-=` removes these entries in place, so the change is visible
# through the filter's `tokens` attribute -- presumably the intent; confirm.
# If it were an immutable set type, `-=` would only rebind this module-level
# name and leave the filter untouched.
# Words removed here are presumably ones that collide with words of other
# languages -- confirm against the data source.
__corpus_tokens_dict -= {
    "an",
    "i",
    "me",
    "ne",
    "se",
    "take",
    "ten",
    "to",
    "u",
    "we",
}
"""Mimics the previous implementation of ilo pi toki pona taso."""
LazyConfig: IloConfig = {
"preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
Expand Down
9 changes: 6 additions & 3 deletions src/sonatoki/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@
"in",
"no",
"some",
# "papa",
# "papa", # now in sandbox
"on",
"me",
"ipa",
Expand Down Expand Up @@ -591,7 +591,7 @@
"oposite",
"anime",
"potato",
# "japan",
"japan",
"nose",
"kilo",
"alone",
Expand Down Expand Up @@ -629,17 +629,20 @@
"awaken",
"eliminate",
"elite",
"misuse",
"emanate",
"iluminate",
"imense",
"imitate",
"injoke",
"insane",
"insolate",
"insulate",
"intense",
"lemon",
"manipulate",
"misuse",
"ne", # "no" in many other languages
"wana",
}

FALSE_POS_ALPHABETIC: Set[str] = {
Expand Down
2 changes: 1 addition & 1 deletion src/sonatoki/linku.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/sonatoki/sandbox.json

Large diffs are not rendered by default.

9 changes: 4 additions & 5 deletions tests/test_ilo.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,9 @@ def corpus_ilo() -> Ilo:
'jasima omekapo, ki nimisin "jasima enko nimisin". ki enko alu linluwi Jutu alu epiku ki epiku baba is you. ki likujo "SINtelen pona", ki epiku alu "sitelen pona". ki kepen wawajete isipin, kin ki yupekosi alu lipamanka alu wawajete, kin ki enko isipin lipamanka linluwi alu wawajete',
"kalamARRRR",
"Pingo",
"we Luke",
]
CORPUS_SPECIFIC_XFAIL = [
"How to Cut a Kiwi",
"a e i o u",
"we Luke li alente wa",
]
CORPUS_SPECIFIC_XFAIL = []


EXCESSIVE_SYLLABICS = [
Expand Down Expand Up @@ -156,6 +153,8 @@ def corpus_ilo() -> Ilo:
"I see :)",
"I wanna see", # same down to here
"i'm online all the time",
"How to Cut a Kiwi",
"a e i o u",
]

NON_MATCHES = [
Expand Down
2 changes: 1 addition & 1 deletion tests/test_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def test_ku_filters_non_overlap(s: str):
| NIMI_LINKU_COMMON
| NIMI_LINKU_UNCOMMON
| NIMI_LINKU_OBSCURE
| NIMI_LINKU_SANDBOX
| NIMI_LINKU_SANDBOX - {"su"}
)
)
)
Expand Down

0 comments on commit 481cdfb

Please sign in to comment.