Skip to content

Commit

Permalink
sin: ilo sin li ken poki e nimi lon ilo Filter kepeken sona Linku (fixes #8)
Browse files Browse the repository at this point in the history
• Loading branch information
gregdan3 committed Aug 15, 2024
1 parent cb8e35b commit 1a21c2e
Show file tree
Hide file tree
Showing 5 changed files with 168 additions and 121 deletions.
71 changes: 37 additions & 34 deletions src/sonatoki/Filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,34 @@
import re
from abc import ABC, abstractmethod
from copy import deepcopy
from typing import Set, List, Type, Optional
from typing import Set, List, Type, Union, Literal, Optional
from functools import lru_cache as cache # cache comes in 3.9

# PDM
import regex
from typing_extensions import override, deprecated
from typing_extensions import override

# LOCAL
from sonatoki.utils import prep_dictionary
from sonatoki.constants import (
VOWELS,
NIMI_PU,
ALPHABET,
ALL_PUNCT,
ALLOWABLES,
CONSONANTS,
NIMI_UCSUR,
NIMI_KU_LILI,
NIMI_KU_SULI,
NIMI_LINKU_CORE,
NIMI_PU_SYNONYMS,
NIMI_LINKU_COMMON,
FALSE_POS_SYLLABIC,
NIMI_LINKU_OBSCURE,
NIMI_LINKU_SANDBOX,
NOT_IN_PUNCT_CLASS,
NIMI_LINKU_UNCOMMON,
ALL_PUNCT_RANGES_STR,
FALSE_POS_ALPHABETIC,
UCSUR_PUNCT_RANGES_STR,
EMOJI_VARIATION_SELECTOR_RANGES_STR,
LinkuBooks,
LinkuUsageDate,
LinkuUsageCategory,
words_by_tag,
words_by_usage,
)

regex.DEFAULT_VERSION = regex.VERSION1
Expand Down Expand Up @@ -170,40 +167,46 @@ class LongProperName(MinLen, ProperName):
length = 2 # reject "names" of length 1


class NimiPu(MemberFilter):
    # Words tagged book="pu" in the Linku data (see constants.NIMI_PU).
    tokens = prep_dictionary(NIMI_PU)


class NimiPuSynonyms(MemberFilter):
    # Matches the hardcoded pu-synonym set (namako, kin, oko).
    tokens = prep_dictionary(NIMI_PU_SYNONYMS)


class NimiKuSuli(MemberFilter):
    # Words tagged book="ku suli" in the Linku data.
    tokens = prep_dictionary(NIMI_KU_SULI)


class NimiKuLili(MemberFilter):
    # Words tagged book="ku lili" in the Linku data.
    tokens = prep_dictionary(NIMI_KU_LILI)
class NimiLinkuByUsage:
    """Factory producing a MemberFilter of words at/above a usage threshold.

    Instantiating this class does not return an instance: ``__new__`` returns
    a freshly built MemberFilter subclass whose token set is every Linku word
    with a usage score >= ``usage`` at the snapshot ``date`` (the latest
    snapshot when ``date`` is omitted).
    """

    def __new__(
        cls,
        usage: int,
        date: Optional[LinkuUsageDate] = None,
    ) -> Type[MemberFilter]:
        words = words_by_usage(usage, date)

        class AnonLinkuMemberFilter(MemberFilter):
            tokens = prep_dictionary(words)

        return AnonLinkuMemberFilter


class NimiLinkuCommon(MemberFilter):
tokens = prep_dictionary(NIMI_LINKU_COMMON)
class NimiLinkuByTag:
    """Factory producing a MemberFilter of words matching one Linku tag.

    ``__new__`` returns a new MemberFilter subclass whose token set is every
    word whose ``tag`` field ("usage_category" or "book") equals ``category``.
    """

    def __new__(
        cls,
        tag: Union[Literal["usage_category"], Literal["book"]],
        category: Union[LinkuUsageCategory, LinkuBooks],
    ) -> Type[MemberFilter]:
        words = words_by_tag(tag, category)

        class AnonLinkuMemberFilter(MemberFilter):
            tokens = prep_dictionary(words)

        return AnonLinkuMemberFilter


class NimiLinkuObscure(MemberFilter):
tokens = prep_dictionary(NIMI_LINKU_OBSCURE)
# Drop-in replacements for the former static filter classes, now built
# dynamically from the Linku data via the NimiLinkuByTag factory.
NimiPu = NimiLinkuByTag("book", "pu")
NimiKuSuli = NimiLinkuByTag("book", "ku suli")
NimiKuLili = NimiLinkuByTag("book", "ku lili")
NimiLinkuCore = NimiLinkuByTag("usage_category", "core")
NimiLinkuCommon = NimiLinkuByTag("usage_category", "common")
NimiLinkuUncommon = NimiLinkuByTag("usage_category", "uncommon")
NimiLinkuObscure = NimiLinkuByTag("usage_category", "obscure")
NimiLinkuSandbox = NimiLinkuByTag("usage_category", "sandbox")


class NimiLinkuSandbox(MemberFilter):
tokens = prep_dictionary(NIMI_LINKU_SANDBOX)
class NimiPuSynonyms(MemberFilter):
    # Kept as a static filter: the synonym set (namako, kin, oko) is a
    # hardcoded constant, not a tag in the Linku data.
    tokens = prep_dictionary(NIMI_PU_SYNONYMS)


class NimiUCSUR(MemberFilter):
Expand Down
109 changes: 82 additions & 27 deletions src/sonatoki/constants.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,58 @@
# STL
import json
from typing import Set, Dict
from typing import Set, Dict, List, Union, Literal, Optional, TypedDict
from pathlib import Path

# Snapshot dates for which Linku usage data exists.
# PEP 586: a multi-value Literal is equivalent to (and the idiomatic spelling
# of) a Union of single-value Literals.
LinkuUsageDate = Literal[
    "2020-04",
    "2021-10",
    "2022-08",
    "2023-09",
    # "2024-09",
]

# Usage buckets a word may be assigned in the Linku data.
LinkuUsageCategory = Literal[
    "core",
    "common",
    "uncommon",
    "obscure",
    "sandbox",
]

# Which official book (if any) lists a word.
LinkuBooks = Literal[
    "pu",
    "ku suli",
    "ku lili",
    "none",
]

LATEST_DATE: LinkuUsageDate = "2023-09"
# hardcoding this seems bad, but it means the parser is stable w.r.t. Linku!


class LinkuWord(TypedDict):
    """Schema of a single word entry in the Linku (or sandbox) JSON data."""

    id: str
    author_verbatim: str
    author_verbatim_source: str
    book: str  # which official book lists the word; values per LinkuBooks
    coined_era: str
    coined_year: str
    creator: List[str]
    ku_data: Dict[str, int]
    see_also: List[str]
    resources: Dict[str, str]
    representations: Dict[str, Union[str, List[str]]]
    source_language: str
    usage_category: LinkuUsageCategory
    word: str  # the word itself, as filtered on by words_by_tag/words_by_usage
    deprecated: bool
    etymology: List[Dict[str, str]]
    audio: List[Dict[str, str]]
    pu_verbatim: Dict[str, str]
    # integer usage score per survey snapshot — NOTE(review): presumably a
    # percentage; confirm against the Linku data source
    usage: Dict[LinkuUsageDate, int]
    translations: Dict[str, Dict[str, str]]


# LOCAL
from sonatoki.utils import find_unicode_chars, find_unicode_ranges

Expand Down Expand Up @@ -689,36 +739,49 @@
# NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]


def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
    """Collect every word whose entry stores *value* under *key*."""
    matches: Set[str] = set()
    for entry in data.values():
        if entry[key] == value:
            matches.add(entry["word"])
    return matches
def linku_data() -> Dict[str, LinkuWord]:
    """Load and merge the Linku and sandbox word data files.

    Sandbox entries override Linku entries on key collisions (dict-merge
    semantics of ``{**a, **b}``).

    NOTE: this does open+read+parse two files each time you construct a filter
    but i expect users to construct filters only at the start of runtime
    there is no reason to waste your RAM by leaving the linku data in it
    """
    # json.load reads the stream directly; explicit encoding keeps parsing
    # independent of the platform's locale default
    with open(LINKU, encoding="utf-8") as f:
        linku: Dict[str, LinkuWord] = json.load(f)
    with open(SANDBOX, encoding="utf-8") as f:
        sandbox: Dict[str, LinkuWord] = json.load(f)

    return {**linku, **sandbox}


def words_by_tag(tag: str, value: str) -> Set[str]:
    """Fetch every Linku word whose entry stores *value* under *tag*."""
    entries = linku_data()
    found: Set[str] = set()
    for entry in entries.values():
        if entry[tag] == value:
            found.add(entry["word"])
    return found

with open(LINKU) as f:
linku: Dict[str, Dict[str, str]] = json.loads(f.read())
NIMI_PU = category_helper(linku, "book", "pu")
NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}

NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
def words_by_usage(
    usage: int,
    date: Optional[LinkuUsageDate] = None,
) -> Set[str]:
    """Collect every word whose usage score at *date* meets the threshold.

    Defaults to the latest hardcoded snapshot when *date* is omitted. Words
    with no recorded score for the requested snapshot are skipped entirely,
    so they are excluded even when *usage* is 0.
    """
    if not date:
        date = LATEST_DATE
    data = linku_data()

    result: Set[str] = set()
    for word in data.values():
        usages = word["usage"]
        # skip words with no score recorded for this snapshot
        if date in usages and usages[date] >= usage:
            result.add(word["word"])

    return result


NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}

with open(SANDBOX) as f:
sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}

# with open(SYLLABICS) as f:
# FALSE_POS_SYLLABIC = {line.strip() for line in f}
#
# with open(ALPHABETICS) as f:
# FALSE_POS_ALPHABETIC = {line.strip() for line in f}

del linku
del sandbox

__all__ = [
"ALLOWABLES",
"ALL_PUNCT",
Expand All @@ -727,14 +790,6 @@ def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Se
"CONSONANTS",
"EMOJI_VARIATION_SELECTOR_RANGES",
"EMOJI_VARIATION_SELECTOR_RANGES_STR",
"NIMI_KU_LILI",
"NIMI_KU_SULI",
"NIMI_LINKU_COMMON",
"NIMI_LINKU_CORE",
"NIMI_LINKU_OBSCURE",
"NIMI_LINKU_SANDBOX",
"NIMI_LINKU_UNCOMMON",
"NIMI_PU",
"NIMI_PU_SYNONYMS",
"POSIX_PUNCT",
"POSIX_PUNCT_RANGES",
Expand Down
Loading

0 comments on commit 1a21c2e

Please sign in to comment.