Skip to content

Commit

Permalink
📚 integrate language list with converter
Browse files Browse the repository at this point in the history
  • Loading branch information
StefanVukovic99 committed Nov 17, 2023
1 parent 2522713 commit 4e3ffe0
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 86 deletions.
2 changes: 1 addition & 1 deletion 5-make-yomichan.js → 5-make-yezichak.js
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ writeFileSync(`data/language/${language_short}/skippedIpaTags.json`, JSON.string
console.log('total tagged terms', taggedTermCount, 'skipped term tags', Object.values(skippedTermTags).reduce((a, b) => a + b, 0));
writeFileSync(`data/language/${language_short}/skippedTermTags.json`, JSON.stringify(sortBreakdown(skippedTermTags), null, 2));

console.log('5-make-yomichan.js: Done!');
console.log('5-make-yezichak.js: Done!');

function writeInBatches(inputArray, filenamePrefix, batchSize = 100000) {
let bankIndex = 0;
Expand Down
196 changes: 116 additions & 80 deletions auto.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,115 +7,151 @@ export DEBUG_WORD
export OPENSUBS_PATH
export DICT_NAME

force_run=false

# Check for the language and language_short arguments
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Usage: $0 <language> <language_short> [-f|--force]"
if [ -z "$1" ]; then
echo "Usage: $0 <language> [f][d][a]"
exit 1
fi

# Check for the force flag
if [ "$3" = "-f" ] || [ "$3" = "--force" ]; then
force_run=true
fi

export language="$1"
export language_short="$2"
# Parse the optional flag string passed as the second argument.
# Each letter is detected by substring match, so letters may be combined
# in any order (e.g. "fd"); characters other than f/d/a are ignored.
#   f -> force_run=true
#   d -> redownload=true
#   a -> all_languages=true
force_run=false
redownload=false
all_languages=false

if [[ "$2" == *f* ]]; then
  force_run=true
fi

if [[ "$2" == *d* ]]; then
  redownload=true
fi

if [[ "$2" == *a* ]]; then
  all_languages=true
fi

# Report the effective settings before any work starts.
printf 'force_run: %s\n' "$force_run"
printf 'redownload: %s\n' "$redownload"
printf 'all_languages: %s\n' "$all_languages"

# Step 1: Install dependencies
npm i

# Step 2: Run create-folder.js with the language argument
node 1-create-folders.js

# Calculate URL
filename="kaikki.org-dictionary-$language.json"
if [ "$language" = "Serbo-Croatian" ]; then
filename="kaikki.org-dictionary-SerboCroatian.json"
fi

export filename
languages=$(jq '.' ../ext/js/language/languages.json)

url="https://kaikki.org/dictionary/$language/$filename"
export lang="$1"

# Step 3: Download JSON data if it doesn't exist
if [ ! -f "data/kaikki/$filename" ]; then
echo "Downloading $filename from $url"
wget "$url"
# Iterate over each object in the array
for entry in $(echo "${languages}" | jq -c '.[]'); do
# Extract values from the object
iso=$(echo "${entry}" | jq -r '.iso')
language=$(echo "${entry}" | jq -r '.language')
flag=$(echo "${entry}" | jq -r '.flag')

mv $filename "data/kaikki/"
else
echo "Kaikki dict already exists. Skipping download."
fi
if [ "$language" != "$lang" ] && [ "$all_languages" = false ]; then
continue
fi

# Step 4: Run tidy-up.js if the tidy files don't exist
if [ ! -f "data/tidy/$language_short-forms.json" ] || [ ! -f "data/tidy/$language_short-lemmas.json" ] || [ "$force_run" = true ]; then
echo "Tidying up $filename"
node --max-old-space-size=4096 2-tidy-up.js
else
echo "Tidy file already exists. Skipping tidying."
fi
echo "PROCESSING $language ------------------------------------"

export language="$language"
export language_short="$iso"

# Calculate URL
filename="kaikki.org-dictionary-$language.json"
if [ "$language" = "Serbo-Croatian" ]; then
filename="kaikki.org-dictionary-SerboCroatian.json"
fi

# Step 5 (optional): Create an array of sentences
if [ ! -f "data/sentences/$language_short-sentences.json" ] || [ "$force_run" = true ]; then
if [ -d "$OPENSUBS_PATH" ]; then
echo "Creating sentences file"
python3 3-opensubs-to-freq.py
export filename

url="https://kaikki.org/dictionary/$language/$filename"

# Step 3: Download JSON data if it doesn't exist
if [ ! -f "data/kaikki/$filename" ] || [ "$redownload" = true ]; then
echo "Downloading $filename from $url"
wget "$url"

mv $filename "data/kaikki/"
else
echo "OpenSubtitles path not found. Skipping sentence creation."
echo "Kaikki dict already exists. Skipping download."
fi
else
echo "Sentences file already exists. Skipping sentence creation."
fi

# Step 6: Create a frequency list
if [ ! -f "data/freq/$language_short-freq.json" ] || [ "$force_run" = true ]; then
echo "Creating frequency file"
node 4-create-freq.js
else
echo "Freq file already exists. Skipping freq creation."
fi
# Step 4: Run tidy-up.js if the tidy files don't exist
if [ ! -f "data/tidy/$language_short-forms.json" ] || [ ! -f "data/tidy/$language_short-lemmas.json" ] || [ "$force_run" = true ]; then
echo "Tidying up $filename"
node --max-old-space-size=4096 2-tidy-up.js
else
echo "Tidy file already exists. Skipping tidying."
fi

dict_file="$DICT_NAME-dict-$language_short.zip"
ipa_file="$DICT_NAME-ipa-$language_short.zip"
freq_file="$DICT_NAME-freq-$language_short.zip"
# Step 5 (optional): Create an array of sentences
if [ ! -f "data/sentences/$language_short-sentences.json" ] || [ "$force_run" = true ]; then
if [ -d "$OPENSUBS_PATH" ]; then
echo "Creating sentences file"
python3 3-opensubs-to-freq.py
else
echo "OpenSubtitles path not found. Skipping sentence creation."
fi
else
echo "Sentences file already exists. Skipping sentence creation."
fi

# Step 7: Create Yomichan files
if [ ! -f "$dict_file" ] || [ ! -f "$ipa_file" ] || [ "$force_run" = true ]; then
echo "Creating Yomichan dict and IPA files"
if node 5-make-yomichan.js; then
zip -j "$dict_file" data/temp/dict/index.json data/temp/dict/tag_bank_1.json data/temp/dict/term_bank_*.json
zip -j "$ipa_file" data/temp/ipa/index.json data/temp/ipa/tag_bank_1.json data/temp/ipa/term_meta_bank_*.json
# Step 6: Create a frequency list
if [ ! -f "data/freq/$language_short-freq.json" ] || [ "$force_run" = true ]; then
echo "Creating frequency file"
node 4-create-freq.js
else
echo "Error: Yomichan generation script failed."
echo "Freq file already exists. Skipping freq creation."
fi
else
echo "Yomichan dict already exists. Skipping Yomichan creation."
fi

# Step 8: Convert frequency list to rank-based Yomichan format
if [ ! -f "$freq_file" ] || [ "$force_run" = true ]; then
echo "Creating Yomichan freq files"
if python3 6-freq-to-rank.py; then
zip -j "$freq_file" data/temp/freq/index.json data/temp/freq/term_meta_bank_*.json
dict_file="$DICT_NAME-dict-$language_short.zip"
ipa_file="$DICT_NAME-ipa-$language_short.zip"
freq_file="$DICT_NAME-freq-$language_short.zip"

# Step 7: Create Yezichak files
if [ ! -f "$dict_file" ] || [ ! -f "$ipa_file" ] || [ "$force_run" = true ]; then
echo "Creating Yezichak dict and IPA files"
if node 5-make-yezichak.js; then
zip -j "$dict_file" data/temp/dict/index.json data/temp/dict/tag_bank_1.json data/temp/dict/term_bank_*.json
zip -j "$ipa_file" data/temp/ipa/index.json data/temp/ipa/tag_bank_1.json data/temp/ipa/term_meta_bank_*.json
else
echo "Error: Yezichak generation script failed."
fi
else
echo "Error: Frequency to rank conversion script failed."
echo "Yezichak dict already exists. Skipping Yezichak creation."
fi
else
echo "Yomichan freq already exists. Skipping Yomichan creation."
fi

if [ -f "$dict_file" ]; then
mv "$dict_file" "data/language/$language_short/"
fi
# Step 8: Convert frequency list to rank-based Yezichak format
if [ ! -f "$freq_file" ] || [ "$force_run" = true ]; then
echo "Creating Yezichak freq files"
if python3 6-freq-to-rank.py; then
zip -j "$freq_file" data/temp/freq/index.json data/temp/freq/term_meta_bank_*.json
else
echo "Error: Frequency to rank conversion script failed."
fi
else
echo "Yezichak freq already exists. Skipping Yezichak creation."
fi

if [ -f "$ipa_file" ]; then
mv "$ipa_file" "data/language/$language_short/"
fi
if [ -f "$dict_file" ]; then
mv "$dict_file" "data/language/$language_short/"
fi

if [ -f "$freq_file" ]; then
mv "$freq_file" "data/language/$language_short/"
fi
if [ -f "$ipa_file" ]; then
mv "$ipa_file" "data/language/$language_short/"
fi

if [ -f "$freq_file" ]; then
mv "$freq_file" "data/language/$language_short/"
fi

echo "FINISHED $language ------------------------------------"
done
echo "All done!"
12 changes: 7 additions & 5 deletions instructions.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@

2. create a .env file based on .env.example

3. run ./auto.sh German de
3. if your language is not in /ext/js/language/languages.json, add an entry for it (auto.sh reads the "iso", "language" and "flag" fields of each entry)

4. if everything runs, dictionaries should be in data/language/de
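4. run ./auto.sh German (optionally pass a flag string as the second argument, e.g. ./auto.sh German fd: f = rerun the processing steps even if their output files already exist, d = redownload the kaikki data, a = process every language in languages.json)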

5. (optional) data/language/de should contain files with skipped tags for ipa and terms,
you may want to add some to tag_bank json file and rerun ./auto.sh
5. if everything runs, dictionaries should be in data/language/de

6. import the zipped dictionaries into yezichak in your browser
6. (optional) data/language/de should contain files with skipped tags for ipa and terms,
you may want to add some to tag_bank_ipa.json or tag_bank_term.json and rerun ./auto.sh

7. import the zipped dictionaries into yezichak in your browser

0 comments on commit 4e3ffe0

Please sign in to comment.