From 2c0403a4d165607666c78f890b29d918a0ecae6f Mon Sep 17 00:00:00 2001 From: Gregory Danielson Date: Wed, 3 Jul 2024 09:58:11 -0500 Subject: [PATCH] o mute e nimi pi toki Inli --- src/sonatoki/Configs.py | 1 + src/sonatoki/constants.py | 83 ++++++++++++++++++++++++++++++++++++++- tests/test_ilo.py | 4 +- tests/test_properties.py | 9 +++++ 4 files changed, 93 insertions(+), 4 deletions(-) diff --git a/src/sonatoki/Configs.py b/src/sonatoki/Configs.py index f25139e..f2e67f3 100644 --- a/src/sonatoki/Configs.py +++ b/src/sonatoki/Configs.py @@ -73,6 +73,7 @@ class IloConfig(TypedDict): "scoring_filters": [ Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous), And(LongSyllabic, Not(FalsePosSyllabic)), + # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong LongProperName, LongAlphabetic, ], diff --git a/src/sonatoki/constants.py b/src/sonatoki/constants.py index 086ea12..ca15d06 100644 --- a/src/sonatoki/constants.py +++ b/src/sonatoki/constants.py @@ -519,8 +519,10 @@ "kxk", # ken ala ken "wxw", # wile ala wile "msa", + "anusem", } +# NOTE: This is being tracked manually rather than fetched from syllabics.txt until I am convinced that solution is appropriate FALSE_POS_SYLLABIC = { # ordered by frequency in previous TPT data "like", @@ -540,6 +542,7 @@ "man", # "son", # sona typo? "joke", + # pon would go here "so", "ten", "make", @@ -548,11 +551,14 @@ # "aka" # in sandbox "into", "in", + "no", "some", + # "papa", "on", "me", "ipa", "sun", + "mine", "sense", "none", "meme", @@ -561,28 +567,101 @@ "mon", "take", "luna", - "anti", "elo", + "japanese", "an", + "anti", "win", "won", - "we", + "we", # word in sandbox "men", "ton", "woke", + "sen", # seen + "se", # see "semi", "male", + # "pen", # borderline + "woman", + "line", + "meta", + "mini", + "sine", + # "min", # borderline + "oposite", + "anime", + "potato", + # "japan", + "nose", + "kilo", + "alone", + "minute", + "late", + "women", + "leson", + "amen", + "tote", + "lame", + "online", + "tone", + "ate", + "mile", + "melon", + "tense", + "nonsense", + "nine", + "emo", + "unlike", + "lone", + # manual additions + "alike", + "amuse", + "antelope", + "antena", + "apetite", + "asasin", + "asasinate", + "asinine", + "asinine", + "asume", + "atone", + "awake", + "awaken", + "eliminate", + "elite", + "misuse", + "emanate", + "iluminate", + "imense", + "imitate", + "insane", + "insolate", + "insulate", + "intense", + "lemon", + "manipulate", } FALSE_POS_ALPHABETIC: Set[str] = { "t", "is", + "as", "not", + "link", + "wait", "lol", + "new", "also", "isn", # TODO: tokenizer.... "mean", "means", + "it", + "moment", + "its", + "lmao", + "new", + "wel", + "makes", } UCSUR_RANGES = [ diff --git a/tests/test_ilo.py b/tests/test_ilo.py index d05d058..9576e93 100644 --- a/tests/test_ilo.py +++ b/tests/test_ilo.py @@ -73,6 +73,7 @@ def corpus_ilo() -> Ilo: ALPHABETIC_MATCHES = [ "mi mtue o kama sona", + "mi mute o kma son", # this one is odd because `son` is an unintended phonetic match "mi mute o kama kne snoa a", "ni li tptpt", "mi wile pana lon sptp", @@ -120,6 +121,7 @@ def corpus_ilo() -> Ilo: "a ton of insolate puke. make no amen, no joke.", "I elope so, to an elite untaken tune, some unwise tone", "insane asinine lemon awesome atone joke", + "insane asinine lemon awesome atone", # i got more clever ] EXCESSIVE_ALPHABETICS = [ @@ -177,7 +179,6 @@ def corpus_ilo() -> Ilo: FALSE_NEGATIVES = [ # emoticon should not be a problem # a token that is one edit off a known word should be allowed - "mi mute o kma son", # this one is obnoxious because `son` did match phonetically before "mi pnoa", "tok", "mut", @@ -187,7 +188,6 @@ def corpus_ilo() -> Ilo: ] FALSE_POSITIVES = [ - "insane asinine lemon awesome atone", "lete li ike x.x", # this is an emoticon but passes because 'x' is in Filters.Miscellaneous ] diff --git a/tests/test_properties.py b/tests/test_properties.py index 6d7e134..1c6a152 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -25,9 +25,11 @@ NIMI_LINKU_CORE, NIMI_PU_SYNONYMS, NIMI_LINKU_COMMON, + FALSE_POS_SYLLABIC, NIMI_LINKU_OBSCURE, NIMI_LINKU_SANDBOX, NIMI_LINKU_UNCOMMON, + FALSE_POS_ALPHABETIC, ) @@ -76,3 +78,10 @@ def test_nimi_linku_properties(s: str): assert Syllabic.filter(s), repr(s) assert Phonotactic.filter(s), repr(s) # Passing phonotactic implies all of the above + + +@given(st.sampled_from(list(FALSE_POS_ALPHABETIC))) +def test_false_pos_properties(s: str): + res_syllabic = Syllabic.filter(s) + res_alphabetic = Alphabetic.filter(s) + assert res_alphabetic and not res_syllabic