sin: ilo MemberFilter o ken e pana nimi e weka nimi (fixes #7)

gregdan3 · Aug 14, 2024 · 185b704 · 185b704
1 parent 6b2a2a3
commit 185b704
Show file tree

Hide file tree

Showing 3 changed files with 80 additions and 40 deletions.
diff --git a/src/sonatoki/Configs.py b/src/sonatoki/Configs.py
@@ -1,6 +1,6 @@
 # STL
 from copy import deepcopy
-from typing import Set, List, Type, TypedDict, cast
+from typing import List, Type, TypedDict
 
 # PDM
 from typing_extensions import NotRequired
@@ -12,13 +12,11 @@
     Not,
     Filter,
     Numeric,
-    Syllabic,
     NimiUCSUR,
     Alphabetic,
     NimiKuLili,
     NimiKuSuli,
     ProperName,
-    Phonotactic,
     Punctuation,
     LongSyllabic,
     Miscellaneous,
@@ -44,6 +42,34 @@
     AngleBracketObject,
 )
 
+__DICT_PHONOMATCHES = {
+    # Words subtracted (via `sub=`) from NimiLinkuObscure and NimiLinkuSandbox: a word is listed if it appears in English at least 3 times as often as in Toki Pona.
+    # In this case, all of these appear more often in English by a factor of at least 10.
+    "aka",  # also known as
+    "an",  # article
+    "api",  # API
+    "i",  # 1st person
+    "kana",  # japanese script
+    "me",  # 1st person singular, english
+    "ne",  # "no" in several languages
+    "nu",  # "new" in english, "now" in dutch
+    "se",  # spanish particle, english "see"
+    "take",  # acquire, perhaps forcefully or without permission
+    "ten",  # 10
+    "to",  # to, too
+    "je",  # 1st person pronoun, french
+    "u",  # no u
+    "we",  # 1st person plural, english
+    "wi",  # wii and discussions of syllables
+    "sole",  # singular, of shoe
+    # unexplored candidates for removal
+    # "omen",  # ominous
+    # "papa",  # father
+    # "lo",  # "lo" and "loo"
+    # "ewe",  # sheep
+    # "pa",  # father- eh?
+}
+
 
 class IloConfig(TypedDict):
     preprocessors: List[Type[Preprocessor]]
@@ -92,8 +118,8 @@ class IloConfig(TypedDict):
             NimiLinkuCore,
             NimiLinkuCommon,
             NimiLinkuUncommon,
-            NimiLinkuObscure,
-            NimiLinkuSandbox,
+            NimiLinkuObscure(sub=__DICT_PHONOMATCHES),
+            NimiLinkuSandbox(sub=__DICT_PHONOMATCHES),
             NimiUCSUR,
             Miscellaneous,
         ),
@@ -104,40 +130,6 @@ class IloConfig(TypedDict):
     "scorer": SoftScaling,
     "passing_score": 0.8,
 }
-
-# TODO: create a mechanism to omit tokens from a filter with more granularity
-__corpus_tokens_dict: Set[str] = cast(
-    Set[str],
-    CorpusConfig["scoring_filters"][
-        0
-    ].tokens,  # pyright: ignore[reportAttributeAccessIssue]
-)
-__corpus_tokens_dict -= {
-    # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
-    # In this case, all of these appear more often in English by a factor of at least 10.
-    "aka",  # also known as
-    "an",  # article
-    "api",  # API
-    "i",  # 1st person
-    "kana",  # japanese script
-    "me",  # 1st person
-    "ne",  # "no" in several languages
-    "nu",  # "new", now in dutch
-    "se",  # spanish particle, "see"
-    "take",  # acquire, perhaps forcefully or without permission
-    "ten",  # 10
-    "to",  # to, too
-    "u",  # no u
-    "we",  # 1st person plural
-    "wi",  # wii and discussions of syllables
-    "sole",  # singular, of shoe
-    # unexplored candidates for removal
-    # "omen",  # ominous
-    # "papa",  # father
-    # "lo",  # "lo" and "loo"
-    # "ewe",  # sheep
-    # "pa",  # father- eh?
-}
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
     "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],

diff --git a/src/sonatoki/Filters.py b/src/sonatoki/Filters.py
@@ -1,7 +1,8 @@
 # STL
 import re
 from abc import ABC, abstractmethod
-from typing import Set, List, Type
+from copy import deepcopy
+from typing import Set, List, Type, Optional
 from functools import lru_cache as cache  # cache comes in 3.9
 
 # PDM
@@ -101,6 +102,20 @@ class MemberFilter(Filter):
     def filter(cls, token: str) -> bool:
         return token.lower() in cls.tokens
 
+    def __new__(  # calling a MemberFilter class returns a derived filter class, not an instance
+        cls, add: Optional[Set[str]] = None, sub: Optional[Set[str]] = None
+    ) -> Type[Filter]:
+        parent_tokens = deepcopy(cls.tokens)  # work on a copy so cls.tokens is never mutated
+        if add:
+            parent_tokens = parent_tokens.union(add)  # extra tokens the new filter accepts
+        if sub:
+            parent_tokens -= sub  # applied after `add`, so a word in both sets is removed
+
+        class AnonMemberFilter(MemberFilter):  # NOTE: derives from MemberFilter, not from cls
+            tokens = parent_tokens
+
+        return AnonMemberFilter
+
 
 class SubsetFilter(Filter):
     tokens: Set[str]

diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -280,3 +280,36 @@ def test_AndNotFilter(s: str):
     if res_fp:
         # syl matched- but if fp matches, then the composed filter should not match
         assert not res_composed
+
+
+@given(st.sampled_from(list(NIMI_PU | NIMI_KU_SULI)))
+def test_AddTokensToMemberFilter(s: str):
+    PuEnKuSuliFilter = NimiPu(add=NimiKuSuli.tokens)  # NimiPu's tokens plus NimiKuSuli's
+    assert PuEnKuSuliFilter.filter(s)  # every sampled word must be accepted
+
+
+@given(st.sampled_from(list(NIMI_LINKU_SANDBOX | NIMI_KU_LILI)))
+def test_AddTokensToMemberFilterNegative(s: str):
+    PuEnKuSuliFilter = NimiPu(add=NimiKuSuli.tokens)  # NimiPu's tokens plus NimiKuSuli's
+    assert not PuEnKuSuliFilter.filter(s)  # sandbox and ku lili samples must be rejected
+
+
+@given(
+    st.sampled_from(
+        list(
+            NIMI_PU
+            | NIMI_KU_SULI
+            | NIMI_KU_LILI
+            | NIMI_LINKU_UNCOMMON
+            | NIMI_LINKU_OBSCURE
+            | NIMI_LINKU_SANDBOX
+        ),
+    )
+    | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
+)
+def test_SubTokensFromMemberFilter(s: str):
+    NimiAlaFilter = NimiLinkuCore(sub=NimiPu.tokens)
+    # relies on core being a strict subset of pu, so subtracting pu leaves no tokens to match
+    # if kin becomes core, this test needs to be corrected
+
+    assert not NimiAlaFilter.filter(s)