Make it possible to correctly recognize English, and to remove it
Honestly, I'm not using the large wordlist at this time, but this approach works well. A weird approach, or what? It's fine.
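
Concretely, the regen_false_negatives step added below harvests English words that the toki pona shape filters would wrongly accept, and stores them in syllabic.txt and alphabetic.txt. As a minimal sketch of the consumption side, which is not part of this commit (only the output file names come from the diff; the loading and filtering code here is assumed):

    # Sketch only: drop tokens that appear in the generated English wordlists.
    import os

    HERE = os.path.dirname(os.path.realpath(__file__))

    def load_words(filename: str) -> set[str]:
        # One word per line, as written by regen_false_negatives.
        with open(os.path.join(HERE, filename), encoding="utf-8") as f:
            return {line.strip() for line in f if line.strip()}

    FALSE_NEGATIVES = load_words("syllabic.txt") | load_words("alphabetic.txt")

    def drop_english(tokens: list[str]) -> list[str]:
        # Remove tokens that look like toki pona but are known English words.
        return [t for t in tokens if t.lower() not in FALSE_NEGATIVES]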
gregdan3 committed Jul 3, 2024
1 parent 7739815 commit dcf9f5d
Showing 7 changed files with 2,203 additions and 14 deletions.
68 changes: 66 additions & 2 deletions src/sonatoki/__main__.py
@@ -1,26 +1,45 @@
 #!/bin/env python3
 # STL
 import os
 import json
 import argparse
-from typing import Any, Dict, List
+from typing import Any, Set, Dict, List
 
 # PDM
 import emoji
 import requests
 
 # LOCAL
 from sonatoki.utils import find_unicode_ranges
+from sonatoki.Filters import (
+    Or,
+    LongSyllabic,
+    NimiLinkuCore,
+    LongAlphabetic,
+    NimiLinkuCommon,
+    NimiLinkuObscure,
+    NimiLinkuUncommon,
+)
+from sonatoki.Cleaners import ConsecutiveDuplicates
 from sonatoki.constants import (
     UCSUR_PUNCT_RANGES,
     UNICODE_PUNCT_RANGES,
     EMOJI_VARIATION_SELECTOR_RANGES,
 )
 
 HERE = os.path.dirname(os.path.realpath(__file__))
 
 UNICODE_DATA = "https://unicode.org/Public/UNIDATA/UnicodeData.txt"
 
+LINKU_WORDS = "https://api.linku.la/v1/words?lang=en"
+LINKU_SANDBOX = "https://api.linku.la/v1/sandbox?lang=en"
+
+WORDS_10K = "https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt"
+WORDS_25K = "https://raw.githubusercontent.com/dolph/dictionary/master/popular.txt"
+WORDS_479K = (
+    "https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt"
+)
+
 HEADERS = {  # pretend to be Chrome 121, just in case
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.3"
 }
@@ -40,7 +59,50 @@ def download_json(url: str) -> Dict[str, Any]:
 
 
 def regen_linku_data():
-    pass
+    data = download_json(LINKU_WORDS)
+    with open(os.path.join(HERE, "linku.json"), "w") as f:
+        _ = f.write(json.dumps(data))
+
+    data = download_json(LINKU_SANDBOX)
+    with open(os.path.join(HERE, "sandbox.json"), "w") as f:
+        _ = f.write(json.dumps(data))
+
+
+def regen_false_negatives():
+    # TODO: regen from my frequency data where the score is below 0.8?
+    KnownWords = Or(
+        NimiLinkuCore,
+        NimiLinkuCommon,
+        NimiLinkuUncommon,
+        NimiLinkuObscure,
+    )
+
+    syllabic_matches: Set[str] = set()
+    alphabetic_matches: Set[str] = set()
+    data = download(WORDS_25K)
+    for word in data.splitlines():
+        if not word:
+            continue
+        word = ConsecutiveDuplicates.clean(word)
+
+        if KnownWords.filter(word):
+            # ignore dictionary
+            continue
+        if LongSyllabic.filter(word):
+            syllabic_matches.add(word)
+            continue
+        if LongAlphabetic.filter(word):
+            alphabetic_matches.add(word)
+            continue
+
+    # TODO: include short matches or no?
+    with open(os.path.join(HERE, "syllabic.txt"), "w") as f:
+        syllabic_final = sorted([word + "\n" for word in syllabic_matches])
+        f.writelines(syllabic_final)
+
+    with open(os.path.join(HERE, "alphabetic.txt"), "w") as f:
+        alphabetic_final = sorted([word + "\n" for word in alphabetic_matches])
+        f.writelines(alphabetic_final)
 
 
 def regen_unicode_data():
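
For intuition about what regen_false_negatives collects: a "false negative" here is an English word whose letters or syllable shapes happen to pass the toki pona filters. Note that download(WORDS_25K) presumably refers to a plain-text counterpart of download_json defined earlier in this file. The Filter.filter(word) call style below is taken from the code above, but the example words and their results are assumptions for illustration:

    from sonatoki.Filters import LongSyllabic, LongAlphabetic

    # "insane" splits as in-sa-ne, all valid (C)V(n) toki pona syllables,
    # so the syllable-shape filter plausibly accepts it (assumed: True).
    print(LongSyllabic.filter("insane"))

    # "sentiment" uses only letters from toki pona's alphabet but not its
    # syllable shapes, so it plausibly ends up in alphabetic.txt rather
    # than syllabic.txt (assumed: True).
    print(LongAlphabetic.filter("sentiment"))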
@@ -107,6 +169,8 @@ def get_character(data: List[str]):
 
 def main(argv: argparse.Namespace):
     regen_unicode_data()
+    regen_linku_data()
+    regen_false_negatives()
 
 
 if __name__ == "__main__":
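
Assuming the truncated remainder of __main__.py only parses argv and calls main(argv), consistent with the if __name__ == "__main__": guard above, every data file touched here (linku.json, sandbox.json, syllabic.txt, alphabetic.txt, and the regenerated Unicode data) can then be rebuilt in one step:

    python -m sonatoki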