use translations section to create more dictionaries (#107)

* rename to main.sh * groundwork * mvp * delete comments * exit if no translations * wip * move to function * mvp * use sense as subheading * glossary only flag * formatting * no monolingual translations * indent * revert script name to auto.sh * revert script name to auto.sh * fixes * try to use less space * try matrix * try matrix * try matrix * fix quote * uncomment release * vert * echo * try softprops again * try fixing merge ipa and speed up conversion * fixes
yomidevs · Jul 13, 2024 · cc8d9e3 · cc8d9e3
1 parent 614ddba
commit cc8d9e3
Show file tree

Hide file tree

Showing 7 changed files with 522 additions and 156 deletions.
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -4,9 +4,13 @@ on:
   workflow_dispatch:
 
 jobs:
-  build:
+  prepare:
     runs-on: ubuntu-latest
-
+    outputs:
+      tag: ${{ steps.tag.outputs.tag }}
+      languages: ${{ steps.load-languages.outputs.languages }}
+      isos: ${{ steps.load-languages.outputs.isos }}
+      calver: ${{ steps.tag.outputs.calver }}
     steps:
       - name: Checkout repository
         uses: actions/checkout@v2
@@ -16,40 +20,12 @@ jobs:
         with:
           node-version: '20'
 
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.9'
-
-      - name: Create .env file
-        run: |
-          cp .env.example .env
-          sed -i 's/^DICT_NAME=.*/DICT_NAME=kty/' .env
-
       - name: Install jq
         run: sudo apt-get install -y jq
 
-      - name: Install gzip
-        run: sudo apt-get install -y gzip
-
       - name: Install npm dependencies
         run: npm install
 
-      - name: Run auto.sh script
-        run: ./auto.sh ? ?
-
-      - name: Run merge-ipa
-        run: node merge-ipa.js
-
-      - name: Generate list of .zip files
-        id: generate_file_list
-        run: |
-          find data/language -type f -name '*.zip' > zip_files.txt
-          cat zip_files.txt
-          echo "zip_files<<EOF" >> $GITHUB_OUTPUT
-          cat zip_files.txt >> $GITHUB_OUTPUT
-          echo "EOF" >> $GITHUB_OUTPUT
-
       - name: Configure git
         run: |
           git config --global user.name "github-actions[bot]"
@@ -67,6 +43,14 @@ jobs:
           git tag -a $TAG -m "Published version $TAG" ${GITHUB_SHA}
           git push origin $TAG
 
+      - name: Load Languages
+        id: load-languages
+        run: |
+          languages=$(jq -r '.[].language' languages.json | jq -R -s -c 'split("\n") | map(select(. != ""))')
+          echo "languages=$languages" >> $GITHUB_OUTPUT
+          isos=$(jq -r '.[].iso' languages.json | jq -R -s -c 'split("\n") | map(select(. != ""))')
+          echo "isos=$isos" >> $GITHUB_OUTPUT
+      
       - name: Generate markdown table
         id: generate_markdown
         run: |
@@ -87,12 +71,120 @@ jobs:
           token: ${{ secrets.GITHUB_TOKEN }}
           title: "Update downloads.md with list of .zip files"
           body: "This PR updates the downloads.md file with a table listing all .zip files."
-
+      
       - name: Release
         uses: softprops/action-gh-release@v2
         with:
           name: ${{ steps.tag.outputs.calver }}
           tag_name: ${{ steps.tag.outputs.tag }}
           prerelease: true
           generate_release_notes: true
+
+
+  convert:
+    needs: prepare
+    runs-on: ubuntu-latest
+    strategy:
+      max-parallel: 1
+      matrix:
+        edition_language: ${{fromJson(needs.prepare.outputs.languages)}}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v3
+        with:
+          node-version: '20'
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.9'
+
+      - name: Create .env file
+        run: |
+          cp .env.example .env
+          sed -i 's/^DICT_NAME=.*/DICT_NAME=kty/' .env
+
+      - name: Install jq
+        run: sudo apt-get install -y jq
+
+      - name: Install gzip
+        run: sudo apt-get install -y gzip
+
+      - name: Run auto.sh script
+        run: ./auto.sh "${{ matrix.edition_language }}" ? ?
+
+      - name: Generate list of .zip files
+        id: generate_file_list
+        run: |
+          find data/language -type f -name '*.zip' > zip_files.txt
+          cat zip_files.txt
+          echo "zip_files<<EOF" >> $GITHUB_OUTPUT
+          cat zip_files.txt >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
+
+      - name: Upload dictionary files
+        uses: softprops/action-gh-release@v2
+        with:
+          name: ${{ needs.prepare.outputs.calver }}
+          tag_name: ${{ needs.prepare.outputs.tag }}
+          prerelease: true
+          generate_release_notes: true
           files: ${{ steps.generate_file_list.outputs.zip_files }}
+
+  merge-ipa:
+    needs: ["prepare", "convert"]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v3
+        with:
+          node-version: '20'
+
+      - name: Download IPA dicts
+        # The script below reads $isos. Outputs of the `prepare` job are only
+        # reachable through the `needs` context and are not exported to the
+        # shell automatically, so map the JSON array of ISO codes in explicitly.
+        env:
+          isos: ${{ needs.prepare.outputs.isos }}
+        run: |
+          mapfile -t iso_array < <(echo "$isos" | jq -r '.[]')
+          supported_editions="de en es fr ru zh"
+          for source_iso in "${iso_array[@]}"; do
+            for target_iso in "${iso_array[@]}"; do
+              filename="kty-${source_iso}-${target_iso}-ipa.zip"
+              if [ -f "$filename" ]; then
+                  continue
+              fi
+
+              if [[ ! "$supported_editions" == *"$target_iso"* ]]; then
+                  continue
+              fi
+              url="https://github.com/themoeway/kaikki-to-yomitan/releases/download/${{needs.prepare.outputs.tag}}/kty-${source_iso}-${target_iso}-ipa.zip"
+              wget -nv "$url"
+            done
+          done
+
+      - name: Run merge-ipa
+        run: node merge-ipa.js
+
+      - name: Delete downloaded IPA files
+        run: rm *.zip
+
+      - name: Generate list of .zip files
+        id: generate_file_list
+        run: |
+          find data/language -type f -name '*.zip' > zip_files.txt
+          cat zip_files.txt
+          echo "zip_files<<EOF" >> $GITHUB_OUTPUT
+          cat zip_files.txt >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
+      
+      - name: Upload dictionary files
+        uses: softprops/action-gh-release@v2
+        with:
+          name: ${{ needs.prepare.outputs.calver }}
+          tag_name: ${{ needs.prepare.outputs.tag }}
+          prerelease: true
+          generate_release_notes: true
+          files: ${{ steps.generate_file_list.outputs.zip_files }}
diff --git a/2-extract-language.py b/2-extract-language.py
@@ -1,11 +1,11 @@
 import json
 import os
 
-source_iso = os.environ.get("source_iso")
-target_iso = os.environ.get("target_iso")
+download_iso = os.environ.get("download_iso")
+edition_iso = os.environ.get("edition_iso")
 
-input_file = f"data/kaikki/{target_iso}-extract.jsonl"
-output_file = f"data/kaikki/{source_iso}-{target_iso}-extract.jsonl"
+input_file = f"data/kaikki/{edition_iso}-extract.jsonl"
+output_file = f"data/kaikki/{download_iso}-{edition_iso}-extract.jsonl"
 
 print(f"Reading {input_file} and writing {output_file}...")
 
@@ -29,7 +29,7 @@
                 print(f"Error: no lang_code or redirect in line {line_count}.", obj)
             continue
 
-        if obj["lang_code"] == source_iso:
+        if obj["lang_code"] == download_iso:
             output_file.write(line)
 
         # Print progress at the specified interval

diff --git a/4-make-yomitan.js b/4-make-yomitan.js
@@ -1,15 +1,13 @@
-const { readFileSync, writeFileSync, existsSync, readdirSync, mkdirSync, createWriteStream, unlinkSync, write } = require('fs');
-const { sortTags, writeInBatches, consoleOverwrite, mapJsonReviver, logProgress } = require('./util/util');
-
 const path = require('path');
-const date = require('date-and-time');
-const now = new Date();
-const currentDate = date.format(now, 'YYYY.MM.DD');
+const { readFileSync, writeFileSync, existsSync, readdirSync, mkdirSync, unlinkSync } = require('fs');
+const { sortTags, writeInBatches, consoleOverwrite, 
+    mapJsonReviver, logProgress, loadJsonArray, 
+    findPartOfSpeech, incrementCounter, currentDate } = require('./util/util');
 
 const {
-    source_iso, 
-    target_iso, 
-    DEBUG_WORD, 
+    source_iso,
+    target_iso,
+    DEBUG_WORD,
     DICT_NAME,
     tidy_folder: readFolder,
     temp_folder: writeFolder
@@ -31,10 +29,6 @@ if (!existsSync(`data/language/${source_iso}/${target_iso}`)) {
     mkdirSync(`data/language/${source_iso}/${target_iso}`, {recursive: true});
 }
 
-function loadJsonArray(file) {
-    return existsSync(file) ? JSON.parse(readFileSync(file)) : [];
-}
-
 const targetLanguageTermTags = loadJsonArray(`data/language/target-language-tags/${target_iso}/tag_bank_term.json`);
 const languageTermTags = loadJsonArray(`data/language/${source_iso}/${target_iso}/tag_bank_term.json`);
 const termTags = [...targetLanguageTermTags, ...languageTermTags];
@@ -78,16 +72,6 @@ function findTag(tags, tag) {
     return result;
 }
 
-function findPartOfSpeech(pos) {
-    for(const posAliases of partsOfSpeech){
-        if (posAliases.includes(pos)){
-            return posAliases[0];
-        }
-    }
-    incrementCounter(pos, skippedPartsOfSpeech);
-    return pos;
-}
-
 function findModifiedTag(tag){
     let modifiedTag = null;
     tagModifiers.forEach((modifier) => {
@@ -187,7 +171,7 @@ let lastTermBankIndex = 0;
                                     term, // term
                                     reading !== normalizedLemma ? reading : '', // reading
                                     joinedTags, // definition_tags
-                                    findPartOfSpeech(pos), // rules
+                                    findPartOfSpeech(pos, partsOfSpeech, skippedPartsOfSpeech), // rules
                                     0, // frequency
                                     [gloss], // definitions
                                     0, // sequence
@@ -499,10 +483,6 @@ function sortBreakdown(obj){
     return Object.fromEntries(Object.entries(obj).sort((a, b) => b[1] - a[1]));
 }
 
-function incrementCounter(key, counter) {
-    counter[key] = (counter[key] || 0) + 1;
-}
-
 function normalizeOrthography(term) {
     switch (source_iso) {
         case 'ar':