Skip to content

Commit

Permalink
sin: ilo MemberFilter o ken e pana nimi e weka nimi (fixes #7)
Browse files Browse the repository at this point in the history
  • Loading branch information
gregdan3 committed Aug 14, 2024
1 parent 6b2a2a3 commit 185b704
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 40 deletions.
70 changes: 31 additions & 39 deletions src/sonatoki/Configs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# STL
from copy import deepcopy
from typing import Set, List, Type, TypedDict, cast
from typing import List, Type, TypedDict

# PDM
from typing_extensions import NotRequired
Expand All @@ -12,13 +12,11 @@
Not,
Filter,
Numeric,
Syllabic,
NimiUCSUR,
Alphabetic,
NimiKuLili,
NimiKuSuli,
ProperName,
Phonotactic,
Punctuation,
LongSyllabic,
Miscellaneous,
Expand All @@ -44,6 +42,34 @@
AngleBracketObject,
)

# Dictionary words that collide with common English (or other-language) words.
# A sandbox word is removed from the CorpusConfig when it appears in English
# at least 3 times as often as it does in Toki Pona; every entry below clears
# that bar by a factor of 10 or more.
#
# Candidates that have not been investigated for removal yet:
#   "omen" (ominous), "papa" (father), "lo" ("lo" and "loo"),
#   "ewe" (sheep), "pa" (father- eh?)
__DICT_PHONOMATCHES = {
    "aka",  # also known as
    "an",  # article
    "api",  # API
    "i",  # 1st person
    "je",  # 1st person pronoun, french
    "kana",  # japanese script
    "me",  # 1st person singular, english
    "ne",  # "no" in several languages
    "nu",  # "new" in english, "now" in dutch
    "se",  # spanish particle, english "see"
    "sole",  # singular, of shoe
    "take",  # acquire, perhaps forcefully or without permission
    "ten",  # 10
    "to",  # to, too
    "u",  # no u
    "we",  # 1st person plural, english
    "wi",  # wii and discussions of syllables
}


class IloConfig(TypedDict):
preprocessors: List[Type[Preprocessor]]
Expand Down Expand Up @@ -92,8 +118,8 @@ class IloConfig(TypedDict):
NimiLinkuCore,
NimiLinkuCommon,
NimiLinkuUncommon,
NimiLinkuObscure,
NimiLinkuSandbox,
NimiLinkuObscure(sub=__DICT_PHONOMATCHES),
NimiLinkuSandbox(sub=__DICT_PHONOMATCHES),
NimiUCSUR,
Miscellaneous,
),
Expand All @@ -104,40 +130,6 @@ class IloConfig(TypedDict):
"scorer": SoftScaling,
"passing_score": 0.8,
}

# TODO: create a mechanism to omit tokens from a filter with more granularity
# Take the `tokens` attribute of the first scoring filter in CorpusConfig and
# subtract a fixed list of words from it with an in-place `-=`.
# NOTE(review): presumably `.tokens` is a mutable set shared with the filter,
# so the `-=` below changes what that filter matches — confirm against Filters.py.
__corpus_tokens_dict: Set[str] = cast(
Set[str],
CorpusConfig["scoring_filters"][
0
].tokens, # pyright: ignore[reportAttributeAccessIssue]
)
__corpus_tokens_dict -= {
# Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
# In this case, all of these appear more often in English by a factor of at least 10.
"aka", # also known as
"an", # article
"api", # API
"i", # 1st person
"kana", # japanese script
"me", # 1st person singular, english
"ne", # "no" in several languages
"nu", # "new" in english, "now" in dutch
"se", # spanish particle, english "see"
"take", # acquire, perhaps forcefully or without permission
"ten", # 10
"to", # to, too
"u", # no u
"we", # 1st person plural, english
"wi", # wii and discussions of syllables
"sole", # singular, of shoe
# unexplored candidates for removal
# "omen", # ominous
# "papa", # father
# "lo", # "lo" and "loo"
# "ewe", # sheep
# "pa", # father- eh?
}
"""Mimics the previous implementation of ilo pi toki pona taso."""
LazyConfig: IloConfig = {
"preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
Expand Down
17 changes: 16 additions & 1 deletion src/sonatoki/Filters.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# STL
import re
from abc import ABC, abstractmethod
from typing import Set, List, Type
from copy import deepcopy
from typing import Set, List, Type, Optional
from functools import lru_cache as cache # cache comes in 3.9

# PDM
Expand Down Expand Up @@ -101,6 +102,20 @@ class MemberFilter(Filter):
def filter(cls, token: str) -> bool:
return token.lower() in cls.tokens

def __new__(
    cls, add: Optional[Set[str]] = None, sub: Optional[Set[str]] = None
) -> Type[Filter]:
    """Build a derived filter class with a customised token set.

    "Instantiating" a member filter does not create an instance; it returns
    a new class whose `tokens` are this class's tokens with `add` merged in
    and `sub` removed. `sub` is applied after `add`, so a token present in
    both ends up excluded.

    Args:
        add: tokens to include on top of `cls.tokens`; ignored when empty
            or None.
        sub: tokens to exclude from the result; ignored when empty or None.

    Returns:
        A new subclass of `cls`; `cls.tokens` itself is left untouched.
    """
    # Work on a private copy so the parent's class-level set is never mutated.
    derived_tokens = set(cls.tokens)
    if add:
        derived_tokens.update(add)
    if sub:
        # Method form (not `-=`) so any iterable is accepted, matching `add`.
        derived_tokens.difference_update(sub)

    # Derive from `cls` rather than the bare MemberFilter so that anything
    # the customised filter overrides is kept by the derived filter.
    class AnonMemberFilter(cls):
        tokens = derived_tokens

    return AnonMemberFilter


class SubsetFilter(Filter):
tokens: Set[str]
Expand Down
33 changes: 33 additions & 0 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,3 +280,36 @@ def test_AndNotFilter(s: str):
if res_fp:
# syl matched- but if fp matches, then the composed filter should not match
assert not res_composed


@given(st.sampled_from(list(NIMI_PU | NIMI_KU_SULI)))
def test_AddTokensToMemberFilter(s: str):
    """A token sampled from NIMI_PU | NIMI_KU_SULI must pass NimiPu once
    NimiKuSuli's tokens have been added to it."""
    extended_filter = NimiPu(add=NimiKuSuli.tokens)
    accepted = extended_filter.filter(s)
    assert accepted


@given(st.sampled_from(list(NIMI_LINKU_SANDBOX | NIMI_KU_LILI)))
def test_AddTokensToMemberFilterNegative(s: str):
    """A token sampled from NIMI_LINKU_SANDBOX | NIMI_KU_LILI must be rejected
    by NimiPu extended with NimiKuSuli's tokens."""
    extended_filter = NimiPu(add=NimiKuSuli.tokens)
    accepted = extended_filter.filter(s)
    assert not accepted


@given(
    st.sampled_from(
        list(
            NIMI_PU
            | NIMI_KU_SULI
            | NIMI_KU_LILI
            | NIMI_LINKU_UNCOMMON
            | NIMI_LINKU_OBSCURE
            | NIMI_LINKU_SANDBOX
        ),
    )
    | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
)
def test_SubTokensFromMemberFilter(s: str):
    """NimiLinkuCore with NimiPu's tokens subtracted must reject every input.

    Inputs are dictionary words from the listed word sets or arbitrary
    strings fully matching the Syllabic pattern.
    """
    # Per the original author: core is a strict subset of pu, so nothing is
    # expected to remain after the subtraction. If kin becomes core, this
    # test needs to be corrected.
    core_without_pu = NimiLinkuCore(sub=NimiPu.tokens)
    matched = core_without_pu.filter(s)
    assert not matched

0 comments on commit 185b704

Please sign in to comment.