From 2c0403a4d165607666c78f890b29d918a0ecae6f Mon Sep 17 00:00:00 2001
From: Gregory Danielson <gregory.danielson3@gmail.com>
Date: Wed, 3 Jul 2024 09:58:11 -0500
Subject: [PATCH] o mute e nimi pi toki Inli

---
 src/sonatoki/Configs.py   |  1 +
 src/sonatoki/constants.py | 83 ++++++++++++++++++++++++++++++++++++++-
 tests/test_ilo.py         |  4 +-
 tests/test_properties.py  |  9 +++++
 4 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/src/sonatoki/Configs.py b/src/sonatoki/Configs.py
index f25139e..f2e67f3 100644
--- a/src/sonatoki/Configs.py
+++ b/src/sonatoki/Configs.py
@@ -73,6 +73,7 @@ class IloConfig(TypedDict):
     "scoring_filters": [
         Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
         And(LongSyllabic, Not(FalsePosSyllabic)),
+        # NOTE: Words excluded above as syllabic false positives may still pass the name and alphabetic filters below, because the exclusion *could* be wrong
         LongProperName,
         LongAlphabetic,
     ],
diff --git a/src/sonatoki/constants.py b/src/sonatoki/constants.py
index 086ea12..ca15d06 100644
--- a/src/sonatoki/constants.py
+++ b/src/sonatoki/constants.py
@@ -519,8 +519,10 @@
     "kxk",  # ken ala ken
     "wxw",  # wile ala wile
     "msa",
+    "anusem",
 }
 
+# NOTE: This set is maintained by hand rather than loaded from syllabics.txt, until I am convinced that loading it from the file is appropriate
 FALSE_POS_SYLLABIC = {
     # ordered by frequency in previous TPT data
     "like",
@@ -540,6 +542,7 @@
     "man",
     # "son",  # sona typo?
     "joke",
+    # pon would go here
     "so",
     "ten",
     "make",
@@ -548,11 +551,14 @@
     # "aka" # in sandbox
     "into",
     "in",
+    "no",
     "some",
+    # "papa",
     "on",
     "me",
     "ipa",
     "sun",
+    "mine",
     "sense",
     "none",
     "meme",
@@ -561,28 +567,101 @@
     "mon",
     "take",
     "luna",
-    "anti",
     "elo",
+    "japanese",
     "an",
+    "anti",
     "win",
     "won",
-    "we",
+    "we",  # word in sandbox
     "men",
     "ton",
     "woke",
+    "sen",  # seen
+    "se",  # see
     "semi",
     "male",
+    # "pen",  # borderline
+    "woman",
+    "line",
+    "meta",
+    "mini",
+    "sine",
+    # "min",  # borderline
+    "oposite",
+    "anime",
+    "potato",
+    # "japan",
+    "nose",
+    "kilo",
+    "alone",
+    "minute",
+    "late",
+    "women",
+    "leson",
+    "amen",
+    "tote",
+    "lame",
+    "online",
+    "tone",
+    "ate",
+    "mile",
+    "melon",
+    "tense",
+    "nonsense",
+    "nine",
+    "emo",
+    "unlike",
+    "lone",
+    # manual additions
+    "alike",
+    "amuse",
+    "antelope",
+    "antena",
+    "apetite",
+    "asasin",
+    "asasinate",
+    "asinine",
+    "asinine",
+    "asume",
+    "atone",
+    "awake",
+    "awaken",
+    "eliminate",
+    "elite",
+    "misuse",
+    "emanate",
+    "iluminate",
+    "imense",
+    "imitate",
+    "insane",
+    "insolate",
+    "insulate",
+    "intense",
+    "lemon",
+    "manipulate",
 }
 
 FALSE_POS_ALPHABETIC: Set[str] = {
     "t",
     "is",
+    "as",
     "not",
+    "link",
+    "wait",
     "lol",
+    "new",
     "also",
     "isn",  # TODO: tokenizer....
     "mean",
     "means",
+    "it",
+    "moment",
+    "its",
+    "lmao",
+    "new",
+    "wel",
+    "makes",
 }
 
 UCSUR_RANGES = [
diff --git a/tests/test_ilo.py b/tests/test_ilo.py
index d05d058..9576e93 100644
--- a/tests/test_ilo.py
+++ b/tests/test_ilo.py
@@ -73,6 +73,7 @@ def corpus_ilo() -> Ilo:
 
 ALPHABETIC_MATCHES = [
     "mi mtue o kama sona",
+    "mi mute o kma son",  # this one is odd because `son` is an unintended phonetic match
     "mi mute o kama kne snoa a",
     "ni li tptpt",
     "mi wile pana lon sptp",
@@ -120,6 +121,7 @@ def corpus_ilo() -> Ilo:
     "a ton of insolate puke. make no amen, no joke.",
     "I elope so, to an elite untaken tune, some unwise tone",
     "insane asinine lemon awesome atone joke",
+    "insane asinine lemon awesome atone",  # previously listed under FALSE_POSITIVES; moved here after FALSE_POS_SYLLABIC was expanded
 ]
 
 EXCESSIVE_ALPHABETICS = [
@@ -177,7 +179,6 @@ def corpus_ilo() -> Ilo:
 FALSE_NEGATIVES = [
     # emoticon should not be a problem
     # a token that is one edit off a known word should be allowed
-    "mi mute o kma son",  # this one is obnoxious because `son` did match phonetically before
     "mi pnoa",
     "tok",
     "mut",
@@ -187,7 +188,6 @@ def corpus_ilo() -> Ilo:
 ]
 
 FALSE_POSITIVES = [
-    "insane asinine lemon awesome atone",
     "lete li ike x.x",  # this is an emoticon but passes because 'x' is in Filters.Miscellaneous
 ]
 
diff --git a/tests/test_properties.py b/tests/test_properties.py
index 6d7e134..1c6a152 100644
--- a/tests/test_properties.py
+++ b/tests/test_properties.py
@@ -25,9 +25,11 @@
     NIMI_LINKU_CORE,
     NIMI_PU_SYNONYMS,
     NIMI_LINKU_COMMON,
+    FALSE_POS_SYLLABIC,
     NIMI_LINKU_OBSCURE,
     NIMI_LINKU_SANDBOX,
     NIMI_LINKU_UNCOMMON,
+    FALSE_POS_ALPHABETIC,
 )
 
 
@@ -76,3 +78,10 @@ def test_nimi_linku_properties(s: str):
     assert Syllabic.filter(s), repr(s)
     assert Phonotactic.filter(s), repr(s)
     # Passing phonotactic implies all of the above
+
+
+@given(st.sampled_from(sorted(FALSE_POS_ALPHABETIC)))  # sorted: stable element order across runs
+def test_false_pos_properties(s: str):  # every FALSE_POS_ALPHABETIC entry must pass Alphabetic and fail Syllabic
+    res_syllabic = Syllabic.filter(s)
+    res_alphabetic = Alphabetic.filter(s)
+    assert res_alphabetic and not res_syllabic, repr(s)