Make it possible to correctly recognize English, and to remove it
Honestly, I'm not using the large wordlist at this time, but this approach works well. A weird approach, or what? It's fine.
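
Concretely, the regen_false_negatives step added below harvests English words that the toki pona shape filters would wrongly accept, and stores them in syllabic.txt and alphabetic.txt. As a minimal sketch of the consumption side, which is not part of this commit (only the output file names come from the diff; the loading and filtering code here is assumed):

    # Sketch only: drop tokens that appear in the generated English wordlists.
    import os

    HERE = os.path.dirname(os.path.realpath(__file__))

    def load_words(filename: str) -> set[str]:
        # One word per line, as written by regen_false_negatives.
        with open(os.path.join(HERE, filename), encoding="utf-8") as f:
            return {line.strip() for line in f if line.strip()}

    FALSE_NEGATIVES = load_words("syllabic.txt") | load_words("alphabetic.txt")

    def drop_english(tokens: list[str]) -> list[str]:
        # Remove tokens that look like toki pona but are known English words.
        return [t for t in tokens if t.lower() not in FALSE_NEGATIVES]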
gregdan3 committed Jul 3, 2024
1 parent 7739815 commit dcf9f5d
Showing 7 changed files with 2,203 additions and 14 deletions.
68 changes: 66 additions & 2 deletions src/sonatoki/__main__.py
@@ -1,26 +1,45 @@
 #!/bin/env python3
 # STL
 import os
 import json
 import argparse
-from typing import Any, Dict, List
+from typing import Any, Set, Dict, List
 
 # PDM
 import emoji
 import requests
 
 # LOCAL
 from sonatoki.utils import find_unicode_ranges
+from sonatoki.Filters import (
+    Or,
+    LongSyllabic,
+    NimiLinkuCore,
+    LongAlphabetic,
+    NimiLinkuCommon,
+    NimiLinkuObscure,
+    NimiLinkuUncommon,
+)
+from sonatoki.Cleaners import ConsecutiveDuplicates
 from sonatoki.constants import (
     UCSUR_PUNCT_RANGES,
     UNICODE_PUNCT_RANGES,
     EMOJI_VARIATION_SELECTOR_RANGES,
 )
 
 HERE = os.path.dirname(os.path.realpath(__file__))
 
 UNICODE_DATA = "https://unicode.org/Public/UNIDATA/UnicodeData.txt"
 
+LINKU_WORDS = "https://api.linku.la/v1/words?lang=en"
+LINKU_SANDBOX = "https://api.linku.la/v1/sandbox?lang=en"
+
+WORDS_10K = "https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt"
+WORDS_25K = "https://raw.githubusercontent.com/dolph/dictionary/master/popular.txt"
+WORDS_479K = (
+    "https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt"
+)
+
 HEADERS = {  # pretend to be Chrome 121, just in case
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.3"
 }
@@ -40,7 +59,50 @@ def download_json(url: str) -> Dict[str, Any]:
 
 
 def regen_linku_data():
-    pass
+    data = download_json(LINKU_WORDS)
+    with open(os.path.join(HERE, "linku.json"), "w") as f:
+        _ = f.write(json.dumps(data))
+
+    data = download_json(LINKU_SANDBOX)
+    with open(os.path.join(HERE, "sandbox.json"), "w") as f:
+        _ = f.write(json.dumps(data))
+
+
+def regen_false_negatives():
+    # TODO: regen from my frequency data where the score is below 0.8?
+    KnownWords = Or(
+        NimiLinkuCore,
+        NimiLinkuCommon,
+        NimiLinkuUncommon,
+        NimiLinkuObscure,
+    )
+
+    syllabic_matches: Set[str] = set()
+    alphabetic_matches: Set[str] = set()
+    data = download(WORDS_25K)
+    for word in data.splitlines():
+        if not word:
+            continue
+        word = ConsecutiveDuplicates.clean(word)
+
+        if KnownWords.filter(word):
+            # ignore dictionary
+            continue
+        if LongSyllabic.filter(word):
+            syllabic_matches.add(word)
+            continue
+        if LongAlphabetic.filter(word):
+            alphabetic_matches.add(word)
+            continue
+
+    # TODO: include short matches or no?
+    with open(os.path.join(HERE, "syllabic.txt"), "w") as f:
+        syllabic_final = sorted([word + "\n" for word in syllabic_matches])
+        f.writelines(syllabic_final)
+
+    with open(os.path.join(HERE, "alphabetic.txt"), "w") as f:
+        alphabetic_final = sorted([word + "\n" for word in alphabetic_matches])
+        f.writelines(alphabetic_final)
 
 
 def regen_unicode_data():
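
For intuition about what regen_false_negatives collects: a "false negative" here is an English word whose letters or syllable shapes happen to pass the toki pona filters. Note that download(WORDS_25K) presumably refers to a plain-text counterpart of download_json defined earlier in this file. The Filter.filter(word) call style below is taken from the code above, but the example words and their results are assumptions for illustration:

    from sonatoki.Filters import LongSyllabic, LongAlphabetic

    # "insane" splits as in-sa-ne, all valid (C)V(n) toki pona syllables,
    # so the syllable-shape filter plausibly accepts it (assumed: True).
    print(LongSyllabic.filter("insane"))

    # "sentiment" uses only letters from toki pona's alphabet but not its
    # syllable shapes, so it plausibly ends up in alphabetic.txt rather
    # than syllabic.txt (assumed: True).
    print(LongAlphabetic.filter("sentiment"))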
@@ -107,6 +169,8 @@ def get_character(data: List[str]):
 
 def main(argv: argparse.Namespace):
     regen_unicode_data()
+    regen_linku_data()
+    regen_false_negatives()
 
 
 if __name__ == "__main__":
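
Assuming the truncated remainder of __main__.py only parses argv and calls main(argv), consistent with the if __name__ == "__main__": guard above, every data file touched here (linku.json, sandbox.json, syllabic.txt, alphabetic.txt, and the regenerated Unicode data) can then be rebuilt in one step:

    python -m sonatoki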