Skip to content

Commit

Permalink
use translations section to create more dictionaries (#107)
Browse files Browse the repository at this point in the history
* rename to main.sh

* groundwork

* mvp

* delete comments

* exit if no translations

* wip

* move to function

* mvp

* use sense as subheading

* glossary only flag

* formatting

* no monolingual translations

* indent

* revert script name to auto.sh

* revert script name to auto.sh

* fixes

* try to use less space

* try matrix

* try matrix

* try matrix

* fix quote

* uncomment release

* vert

* echo

* try softprops again

* try fixing merge ipa and speed up conversion

* fixes
  • Loading branch information
StefanVukovic99 authored Jul 13, 2024
1 parent 614ddba commit cc8d9e3
Show file tree
Hide file tree
Showing 7 changed files with 522 additions and 156 deletions.
154 changes: 123 additions & 31 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,13 @@ on:
workflow_dispatch:

jobs:
build:
prepare:
runs-on: ubuntu-latest

outputs:
tag: ${{ steps.tag.outputs.tag }}
languages: ${{ steps.load-languages.outputs.languages }}
isos: ${{ steps.load-languages.outputs.isos }}
calver: ${{ steps.tag.outputs.calver }}
steps:
- name: Checkout repository
uses: actions/checkout@v2
Expand All @@ -16,40 +20,12 @@ jobs:
with:
node-version: '20'

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.9'

- name: Create .env file
run: |
cp .env.example .env
sed -i 's/^DICT_NAME=.*/DICT_NAME=kty/' .env
- name: Install jq
run: sudo apt-get install -y jq

- name: Install gzip
run: sudo apt-get install -y gzip

- name: Install npm dependencies
run: npm install

- name: Run auto.sh script
run: ./auto.sh ? ?

- name: Run merge-ipa
run: node merge-ipa.js

- name: Generate list of .zip files
id: generate_file_list
run: |
find data/language -type f -name '*.zip' > zip_files.txt
cat zip_files.txt
echo "zip_files<<EOF" >> $GITHUB_OUTPUT
cat zip_files.txt >> $GITHUB_OUTPUT
echo "EOF" >> $GITHUB_OUTPUT
- name: Configure git
run: |
git config --global user.name "github-actions[bot]"
Expand All @@ -67,6 +43,14 @@ jobs:
git tag -a $TAG -m "Published version $TAG" ${GITHUB_SHA}
git push origin $TAG
- name: Load Languages
id: load-languages
run: |
languages=$(jq -r '.[].language' languages.json | jq -R -s -c 'split("\n") | map(select(. != ""))')
echo "languages=$languages" >> $GITHUB_OUTPUT
isos=$(jq -r '.[].iso' languages.json | jq -R -s -c 'split("\n") | map(select(. != ""))')
echo "isos=$isos" >> $GITHUB_OUTPUT
- name: Generate markdown table
id: generate_markdown
run: |
Expand All @@ -87,12 +71,120 @@ jobs:
token: ${{ secrets.GITHUB_TOKEN }}
title: "Update downloads.md with list of .zip files"
body: "This PR updates the downloads.md file with a table listing all .zip files."

- name: Release
uses: softprops/action-gh-release@v2
with:
name: ${{ steps.tag.outputs.calver }}
tag_name: ${{ steps.tag.outputs.tag }}
prerelease: true
generate_release_notes: true


convert:
needs: prepare
runs-on: ubuntu-latest
strategy:
max-parallel: 1
matrix:
edition_language: ${{fromJson(needs.prepare.outputs.languages)}}
steps:
- name: Checkout
uses: actions/checkout@v2

- name: Set up Node.js
uses: actions/setup-node@v3
with:
node-version: '20'

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.9'

- name: Create .env file
run: |
cp .env.example .env
sed -i 's/^DICT_NAME=.*/DICT_NAME=kty/' .env
- name: Install jq
run: sudo apt-get install -y jq

- name: Install gzip
run: sudo apt-get install -y gzip

- name: Run auto.sh script
run: ./auto.sh "${{ matrix.edition_language }}" ? ?

- name: Generate list of .zip files
id: generate_file_list
run: |
find data/language -type f -name '*.zip' > zip_files.txt
cat zip_files.txt
echo "zip_files<<EOF" >> $GITHUB_OUTPUT
cat zip_files.txt >> $GITHUB_OUTPUT
echo "EOF" >> $GITHUB_OUTPUT
- name: Upload dictionary files
uses: softprops/action-gh-release@v2
with:
name: ${{ needs.prepare.outputs.calver }}
tag_name: ${{ needs.prepare.outputs.tag }}
prerelease: true
generate_release_notes: true
files: ${{ steps.generate_file_list.outputs.zip_files }}

merge-ipa:
needs: ["prepare", "convert"]
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2

- name: Set up Node.js
uses: actions/setup-node@v3
with:
node-version: '20'

- name: Download IPA dicts
  env:
    # The script below reads "$isos", but job outputs are not exported to the
    # shell automatically. Without this mapping the variable is empty, the
    # ISO array is empty, and no IPA dictionary is ever downloaded.
    # Value is the JSON array of ISO codes produced by the prepare job's
    # "Load Languages" step.
    isos: ${{ needs.prepare.outputs.isos }}
  run: |
    # Turn the JSON array of ISO codes into a bash array.
    mapfile -t iso_array < <(echo "$isos" | jq -r '.[]')
    # Only pairs whose target is one of these editions are fetched.
    supported_editions="de en es fr ru zh"
    for source_iso in "${iso_array[@]}"; do
      for target_iso in "${iso_array[@]}"; do
        filename="kty-${source_iso}-${target_iso}-ipa.zip"
        # Skip archives already present in the working directory.
        if [ -f "$filename" ]; then
          continue
        fi
        if [[ ! "$supported_editions" == *"$target_iso"* ]]; then
          continue
        fi
        url="https://github.com/themoeway/kaikki-to-yomitan/releases/download/${{needs.prepare.outputs.tag}}/kty-${source_iso}-${target_iso}-ipa.zip"
        wget -nv "$url"
      done
    done
- name: Run merge-ipa
run: node merge-ipa.js

- name: Delete downloaded IPA files
run: rm *.zip

- name: Generate list of .zip files
id: generate_file_list
run: |
find data/language -type f -name '*.zip' > zip_files.txt
cat zip_files.txt
echo "zip_files<<EOF" >> $GITHUB_OUTPUT
cat zip_files.txt >> $GITHUB_OUTPUT
echo "EOF" >> $GITHUB_OUTPUT
- name: Upload dictionary files
uses: softprops/action-gh-release@v2
with:
name: ${{ needs.prepare.outputs.calver }}
tag_name: ${{ needs.prepare.outputs.tag }}
prerelease: true
generate_release_notes: true
files: ${{ steps.generate_file_list.outputs.zip_files }}
10 changes: 5 additions & 5 deletions 2-extract-language.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import json
import os

source_iso = os.environ.get("source_iso")
target_iso = os.environ.get("target_iso")
download_iso = os.environ.get("download_iso")
edition_iso = os.environ.get("edition_iso")

input_file = f"data/kaikki/{target_iso}-extract.jsonl"
output_file = f"data/kaikki/{source_iso}-{target_iso}-extract.jsonl"
input_file = f"data/kaikki/{edition_iso}-extract.jsonl"
output_file = f"data/kaikki/{download_iso}-{edition_iso}-extract.jsonl"

print(f"Reading {input_file} and writing {output_file}...")

Expand All @@ -29,7 +29,7 @@
print(f"Error: no lang_code or redirect in line {line_count}.", obj)
continue

if obj["lang_code"] == source_iso:
if obj["lang_code"] == download_iso:
output_file.write(line)

# Print progress at the specified interval
Expand Down
36 changes: 8 additions & 28 deletions 4-make-yomitan.js
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
const { readFileSync, writeFileSync, existsSync, readdirSync, mkdirSync, createWriteStream, unlinkSync, write } = require('fs');
const { sortTags, writeInBatches, consoleOverwrite, mapJsonReviver, logProgress } = require('./util/util');

const path = require('path');
const date = require('date-and-time');
const now = new Date();
const currentDate = date.format(now, 'YYYY.MM.DD');
const { readFileSync, writeFileSync, existsSync, readdirSync, mkdirSync, unlinkSync } = require('fs');
const { sortTags, writeInBatches, consoleOverwrite,
mapJsonReviver, logProgress, loadJsonArray,
findPartOfSpeech, incrementCounter, currentDate } = require('./util/util');

const {
source_iso,
target_iso,
DEBUG_WORD,
source_iso,
target_iso,
DEBUG_WORD,
DICT_NAME,
tidy_folder: readFolder,
temp_folder: writeFolder
Expand All @@ -31,10 +29,6 @@ if (!existsSync(`data/language/${source_iso}/${target_iso}`)) {
mkdirSync(`data/language/${source_iso}/${target_iso}`, {recursive: true});
}

/**
 * Loads and parses a JSON file, falling back to an empty array when the
 * file does not exist.
 *
 * @param {string} file - Path of the JSON file to read.
 * @returns {*} The parsed JSON content, or `[]` if the file is missing.
 */
function loadJsonArray(file) {
    if (!existsSync(file)) {
        return [];
    }
    const contents = readFileSync(file);
    return JSON.parse(contents);
}

const targetLanguageTermTags = loadJsonArray(`data/language/target-language-tags/${target_iso}/tag_bank_term.json`);
const languageTermTags = loadJsonArray(`data/language/${source_iso}/${target_iso}/tag_bank_term.json`);
const termTags = [...targetLanguageTermTags, ...languageTermTags];
Expand Down Expand Up @@ -78,16 +72,6 @@ function findTag(tags, tag) {
return result;
}

/**
 * Maps a part-of-speech label to the first entry of the alias group that
 * contains it. When no group in `partsOfSpeech` contains the label, the miss
 * is tallied in `skippedPartsOfSpeech` and the label is returned unchanged.
 *
 * @param {string} pos - Part-of-speech label to look up.
 * @returns {string} First entry of the matching alias group, or `pos` itself.
 */
function findPartOfSpeech(pos) {
    // presumably each group lists the canonical name first — verify against the data file
    const matchingGroup = partsOfSpeech.find((aliases) => aliases.includes(pos));
    if (matchingGroup !== undefined) {
        return matchingGroup[0];
    }

    incrementCounter(pos, skippedPartsOfSpeech);
    return pos;
}

function findModifiedTag(tag){
let modifiedTag = null;
tagModifiers.forEach((modifier) => {
Expand Down Expand Up @@ -187,7 +171,7 @@ let lastTermBankIndex = 0;
term, // term
reading !== normalizedLemma ? reading : '', // reading
joinedTags, // definition_tags
findPartOfSpeech(pos), // rules
findPartOfSpeech(pos, partsOfSpeech, skippedPartsOfSpeech), // rules
0, // frequency
[gloss], // definitions
0, // sequence
Expand Down Expand Up @@ -499,10 +483,6 @@ function sortBreakdown(obj){
return Object.fromEntries(Object.entries(obj).sort((a, b) => b[1] - a[1]));
}

/**
 * Adds one to the tally stored under `key` in `counter`, starting from zero
 * when the key has not been counted yet. Mutates `counter` in place.
 *
 * @param {string} key - Entry to count.
 * @param {Object<string, number>} counter - Tally object updated in place.
 * @returns {void}
 */
function incrementCounter(key, counter) {
    // Only read own properties: on a plain object, keys such as "constructor"
    // or "toString" would otherwise resolve to inherited functions and turn
    // the count into a string concatenation.
    const current = Object.hasOwn(counter, key) ? counter[key] : 0;
    counter[key] = (current || 0) + 1;
}

function normalizeOrthography(term) {
switch (source_iso) {
case 'ar':
Expand Down
Loading

0 comments on commit cc8d9e3

Please sign in to comment.