Skip to content

Commit

Permalink
o mute e nimi pi toki Inli
Browse files Browse the repository at this point in the history
  • Loading branch information
gregdan3 committed Jul 3, 2024
1 parent f0706c6 commit 2c0403a
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 4 deletions.
1 change: 1 addition & 0 deletions src/sonatoki/Configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ class IloConfig(TypedDict):
"scoring_filters": [
Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
And(LongSyllabic, Not(FalsePosSyllabic)),
# NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
LongProperName,
LongAlphabetic,
],
Expand Down
83 changes: 81 additions & 2 deletions src/sonatoki/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,8 +519,10 @@
"kxk", # ken ala ken
"wxw", # wile ala wile
"msa",
"anusem",
}

# NOTE: This is being tracked manually rather than fetched from syllabics.txt until I am convinced that solution is appropriate
FALSE_POS_SYLLABIC = {
# ordered by frequency in previous TPT data
"like",
Expand All @@ -540,6 +542,7 @@
"man",
# "son", # sona typo?
"joke",
# pon would go here
"so",
"ten",
"make",
Expand All @@ -548,11 +551,14 @@
# "aka" # in sandbox
"into",
"in",
"no",
"some",
# "papa",
"on",
"me",
"ipa",
"sun",
"mine",
"sense",
"none",
"meme",
Expand All @@ -561,28 +567,101 @@
"mon",
"take",
"luna",
"anti",
"elo",
"japanese",
"an",
"anti",
"win",
"won",
"we",
"we", # word in sandbox
"men",
"ton",
"woke",
"sen", # seen
"se", # see
"semi",
"male",
# "pen", # borderline
"woman",
"line",
"meta",
"mini",
"sine",
# "min", # borderline
"oposite",
"anime",
"potato",
# "japan",
"nose",
"kilo",
"alone",
"minute",
"late",
"women",
"leson",
"amen",
"tote",
"lame",
"online",
"tone",
"ate",
"mile",
"melon",
"tense",
"nonsense",
"nine",
"emo",
"unlike",
"lone",
# manual additions
"alike",
"amuse",
"antelope",
"antena",
"apetite",
"asasin",
"asasinate",
"asinine",
"asinine",
"asume",
"atone",
"awake",
"awaken",
"eliminate",
"elite",
"misuse",
"emanate",
"iluminate",
"imense",
"imitate",
"insane",
"insolate",
"insulate",
"intense",
"lemon",
"manipulate",
}

# Hand-maintained set of words treated as false positives of the alphabetic
# filter.
# NOTE(review): tests/test_properties.py asserts that every entry passes
# Alphabetic.filter and fails Syllabic.filter, so any new entry must satisfy
# both conditions.
FALSE_POS_ALPHABETIC: Set[str] = {
    "t",
    "is",
    "as",
    "not",
    "link",
    "wait",
    "lol",
    "new",
    "also",
    "isn",  # TODO: tokenizer....
    "mean",
    "means",
    "it",
    "moment",
    "its",
    "lmao",
    "wel",  # NOTE(review): presumably "well" with the doubled letter collapsed - confirm
    "makes",
}

UCSUR_RANGES = [
Expand Down
4 changes: 2 additions & 2 deletions tests/test_ilo.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def corpus_ilo() -> Ilo:

ALPHABETIC_MATCHES = [
"mi mtue o kama sona",
"mi mute o kma son", # this one is odd because `son` is an unintended phonetic match
"mi mute o kama kne snoa a",
"ni li tptpt",
"mi wile pana lon sptp",
Expand Down Expand Up @@ -120,6 +121,7 @@ def corpus_ilo() -> Ilo:
"a ton of insolate puke. make no amen, no joke.",
"I elope so, to an elite untaken tune, some unwise tone",
"insane asinine lemon awesome atone joke",
"insane asinine lemon awesome atone", # i got more clever
]

EXCESSIVE_ALPHABETICS = [
Expand Down Expand Up @@ -177,7 +179,6 @@ def corpus_ilo() -> Ilo:
FALSE_NEGATIVES = [
# emoticon should not be a problem
# a token that is one edit off a known word should be allowed
"mi mute o kma son", # this one is obnoxious because `son` did match phonetically before
"mi pnoa",
"tok",
"mut",
Expand All @@ -187,7 +188,6 @@ def corpus_ilo() -> Ilo:
]

# Inputs the test suite tracks as known false positives.
# NOTE(review): presumably these are accepted by the scorer although they
# should be rejected - confirm against the test that consumes this list.
# NOTE(review): the first entry also appears in a list earlier in this file
# annotated "i got more clever"; confirm it still belongs here.
FALSE_POSITIVES = [
"insane asinine lemon awesome atone",
"lete li ike x.x", # this is an emoticon but passes because 'x' is in Filters.Miscellaneous
]

Expand Down
9 changes: 9 additions & 0 deletions tests/test_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@
NIMI_LINKU_CORE,
NIMI_PU_SYNONYMS,
NIMI_LINKU_COMMON,
FALSE_POS_SYLLABIC,
NIMI_LINKU_OBSCURE,
NIMI_LINKU_SANDBOX,
NIMI_LINKU_UNCOMMON,
FALSE_POS_ALPHABETIC,
)


Expand Down Expand Up @@ -76,3 +78,10 @@ def test_nimi_linku_properties(s: str):
assert Syllabic.filter(s), repr(s)
assert Phonotactic.filter(s), repr(s)
# Passing phonotactic implies all of the above


# sorted() gives sampled_from a stable element order; iterating the set
# directly would vary with string hash randomization between runs.
@given(st.sampled_from(sorted(FALSE_POS_ALPHABETIC)))
def test_false_pos_properties(s: str):
    """Check each FALSE_POS_ALPHABETIC entry passes Alphabetic but not Syllabic.

    The two conditions are asserted separately, each with ``repr(s)`` as the
    failure message, so a failing run names the offending word and shows
    which filter disagreed.
    """
    res_syllabic = Syllabic.filter(s)
    res_alphabetic = Alphabetic.filter(s)
    assert res_alphabetic, repr(s)
    assert not res_syllabic, repr(s)

0 comments on commit 2c0403a

Please sign in to comment.