Skip to content

Commit

Permalink
o weka e nimi mute tan nasin ilo Corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
gregdan3 committed Jul 27, 2024
1 parent 6f0c259 commit b29e245
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 11 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "sonatoki"
version = "0.5.2"
version = "0.5.3"
description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
authors = [
{ name = "jan Kekan San (@gregdan3)", email = "[email protected]" },
Expand Down
34 changes: 24 additions & 10 deletions src/sonatoki/Configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,16 +112,30 @@ class IloConfig(TypedDict):
].tokens, # pyright: ignore[reportAttributeAccessIssue]
)
__corpus_tokens_dict -= {
"an",
"i",
"me",
"ne",
"se",
"take",
"ten",
"to",
"u",
"we",
# Sandbox words are removed from the CorpusConfig if they appear more frequently in English than in Toki Pona by a factor of at least 3.
# Every word removed below appears more often in English by a factor of at least 10.
"aka", # also known as
"an", # article
"api", # API
"i", # 1st person
"kana", # Japanese script
"me", # 1st person
"ne", # "no" in several languages
"nu", # "new"; also "now" in Dutch
"se", # Spanish particle, "see"
"take", # acquire, perhaps forcefully or without permission
"ten", # 10
"to", # to, too
"u", # no u
"we", # 1st person plural
"wi", # wii and discussions of syllables
"sole", # "single, only"; also the sole of a shoe
# unexplored candidates for removal
# "omen", # ominous
# "papa", # father
# "lo", # "lo" and "loo"
# "ewe", # sheep
# "pa", # father- eh?
}
"""Mimics the previous implementation of ilo pi toki pona taso."""
LazyConfig: IloConfig = {
Expand Down

0 comments on commit b29e245

Please sign in to comment.