Skip to content

Commit

Permalink
add script to merge ipa from all wiktionaries for a language (#6)
Browse files Browse the repository at this point in the history
* wip

* rename

* finish?

* delete empty file
  • Loading branch information
StefanVukovic99 authored Jan 23, 2024
1 parent 9c7ad11 commit 02c810d
Show file tree
Hide file tree
Showing 8 changed files with 148 additions and 54 deletions.
13 changes: 2 additions & 11 deletions 3-tidy-up.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ const {
tidy_folder: writeFolder
} = process.env;

const { sortTags, similarSort } = require('./util/sort-tags');
const { sortTags, similarSort, consoleOverwrite, clearConsoleLine } = require('./util/util');

function isInflectionGloss(glosses) {
if (targetIso === 'en') {
Expand Down Expand Up @@ -151,6 +151,7 @@ function handleLine(line, lemmaDict, formDict, formStuff, automatedForms) {
.filter(sound => sound && sound.ipa)
.map(sound => ({ ipa: sound.ipa, tags: sound.tags || [] }))
.flatMap(ipaObj => typeof ipaObj.ipa === 'string' ? [ipaObj] : ipaObj.ipa.map(ipa => ({ ipa, tags: ipaObj.tags })) )
.filter(ipaObj => ipaObj.ipa)
: [];

let nestedGlossObj = {};
Expand Down Expand Up @@ -321,13 +322,3 @@ lr.on('end', () => {

consoleOverwrite('3-tidy-up.js finished.\n');
});

/**
 * Erase whatever is currently printed on the active terminal line.
 *
 * Sends a carriage return (cursor back to the start of the line) followed by
 * the `\x1b[K` escape sequence (clear the line) to stdout in a single write.
 */
function clearConsoleLine() {
    const returnToLineStart = '\r';
    const eraseLine = '\x1b[K';
    process.stdout.write(returnToLineStart + eraseLine);
}


/**
 * Replace the content of the current terminal line with `text`.
 *
 * The line is cleared first via `clearConsoleLine()`, then `text` is written
 * to stdout as-is (no newline is appended).
 *
 * @param {string} text - Text to print in place of the current line.
 */
function consoleOverwrite(text) {
    const { stdout } = process;
    clearConsoleLine();
    stdout.write(text);
}
42 changes: 9 additions & 33 deletions 4-make-yomitan.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
const {readFileSync, writeFileSync, existsSync, readdirSync, mkdirSync, createWriteStream, unlinkSync} = require('fs');
const date = require('date-and-time');
const now = new Date();

const {source_iso, target_iso, DEBUG_WORD, DICT_NAME} = process.env;

const currentDate = date.format(now, 'YYYY.MM.DD');

const { sortTags } = require('./util/sort-tags');
const { sortTags, writeInBatches, consoleOverwrite } = require('./util/util');

const {source_iso, target_iso, DEBUG_WORD, DICT_NAME} = process.env;

consoleOverwrite(`4-make-yomitan.js: reading lemmas...`);
const lemmaDict = JSON.parse(readFileSync(`data/tidy/${source_iso}-${target_iso}-lemmas.json`));
Expand All @@ -21,11 +20,11 @@ function loadJson(file) {
return existsSync(file) ? JSON.parse(readFileSync(file)) : [];
}

const commonTermTags = loadJson('data/language/tag_bank_term.json');
const commonTermTags = loadJson(`data/language/${source_iso}/tag_bank_term.json`);
const languageTermTags = loadJson(`data/language/${source_iso}/${target_iso}/tag_bank_term.json`);
const termTags = [...commonTermTags, ...languageTermTags];

const commonIpaTags = loadJson('data/language/tag_bank_ipa.json');
const commonIpaTags = loadJson(`data/language/${source_iso}/tag_bank_ipa.json`);
const languageIpaTags = loadJson(`data/language/${source_iso}/${target_iso}/tag_bank_ipa.json`);
const ipaTags = [...commonIpaTags, ...languageIpaTags];

Expand Down Expand Up @@ -328,7 +327,7 @@ const tempPath = 'data/temp';

const indexJson = {
format: 3,
revision: 'ymt-' + currentDate,
revision: currentDate,
sequenced: true
};

Expand All @@ -342,14 +341,14 @@ for (const folder of folders) {

writeFileSync(`${tempPath}/${folder}/index.json`, JSON.stringify({
...indexJson,
title: `${DICT_NAME}W-${source_iso}-${target_iso}` + (folder === 'dict' ? '' : '-ipa'),
title: `${DICT_NAME}-${source_iso}-${target_iso}` + (folder === 'dict' ? '' : '-ipa'),
}));

writeFileSync(`${tempPath}/${folder}/tag_bank_1.json`, JSON.stringify(Object.values(ymtTags[folder])));

const filename = folder === 'dict' ? 'term_bank_' : 'term_meta_bank_';

writeInBatches(ymt[folder], `${folder}/${filename}`, 25000);
writeInBatches(tempPath, ymt[folder], `${folder}/${filename}`, 25000);
}

console.log('');
Expand All @@ -367,22 +366,7 @@ writeFileSync(`data/language/${source_iso}/${target_iso}/skippedIpaTags.json`, J

writeFileSync(`data/language/${source_iso}/${target_iso}/skippedTermTags.json`, JSON.stringify(sortBreakdown(skippedTermTags), null, 2));

console.log('4-make-yomitan.js: Done!');

/**
 * Write `inputArray` to disk as a numbered series of pretty-printed JSON files.
 *
 * Files are named `${tempPath}/${filenamePrefix}<n>.json` with `<n>` starting
 * at 1, each holding at most `batchSize` consecutive entries. `tempPath` is the
 * module-level output directory. Nothing is written for an empty array.
 *
 * The input array is left untouched; batches are copied out with `slice`
 * instead of being removed with `splice`.
 *
 * @param {Array} inputArray - Entries to write.
 * @param {string} filenamePrefix - Path prefix (relative to `tempPath`) placed before the bank number.
 * @param {number} [batchSize=100000] - Maximum number of entries per file; must be at least 1.
 * @throws {RangeError} If `batchSize` is not a number >= 1 (such a value could never make progress).
 */
function writeInBatches(inputArray, filenamePrefix, batchSize = 100000) {
    // Fractions are truncated, matching how the previous splice-based loop treated them.
    const entriesPerFile = Math.trunc(batchSize);
    if (!(entriesPerFile >= 1)) {
        throw new RangeError(`batchSize must be a number >= 1, got ${batchSize}`);
    }

    consoleOverwrite(`Writing ${inputArray.length.toLocaleString()} entries of ${filenamePrefix}...`);

    let bankIndex = 0;

    for (let start = 0; start < inputArray.length; start += entriesPerFile) {
        bankIndex += 1;
        const batch = inputArray.slice(start, start + entriesPerFile);
        const filename = `${tempPath}/${filenamePrefix}${bankIndex}.json`;
        const content = JSON.stringify(batch, null, 2);

        writeFileSync(filename, content);
    }
}
console.log('4-make-yomitan.js: Done!')

function escapeRegExp(text) {
return text.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, '\\$&');
Expand Down Expand Up @@ -416,11 +400,3 @@ function normalizeOrthography(term) {
}
}

/**
 * Blank out the terminal line the cursor is currently on.
 */
function clearConsoleLine() {
    // '\r' moves the cursor to the beginning of the line, '\x1b[K' clears the line.
    const resetSequence = ['\r', '\x1b[K'].join('');
    process.stdout.write(resetSequence);
}

/**
 * Print `text` over the current terminal line.
 *
 * Clears the line with `clearConsoleLine()` and then writes `text` to stdout
 * without adding a trailing newline.
 *
 * @param {string} text - Replacement content for the current line.
 */
function consoleOverwrite(text) {
    clearConsoleLine();
    const output = process.stdout;
    output.write(text);
}
4 changes: 2 additions & 2 deletions auto.sh
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,8 @@ for entry in "${entries[@]}"; do
rm -f "$kaikki_file"
fi

dict_file="${DICT_NAME}W-$source_iso-$target_iso.zip"
ipa_file="${DICT_NAME}W-$source_iso-$target_iso-ipa.zip"
dict_file="${DICT_NAME}-$source_iso-$target_iso.zip"
ipa_file="${DICT_NAME}-$source_iso-$target_iso-ipa.zip"

# Step 5: Create Yomitan files
if \
Expand Down
6 changes: 2 additions & 4 deletions data/language/en/tag_bank_ipa.json
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
[
["🏴󠁧󠁢󠁥󠁮󠁧󠁿", "dialect", 0, "Received-Pronunciation", 0],
["🇬🇧", "dialect", 0, "UK", 0],
["🇺🇸", "dialect", 0, "US", 0],
["🇺🇸", "dialect", 0, ["US", "General-American"], 0],
["🇦🇺", "dialect", 0, "General-Australian", 0],
["🇺🇸", "dialect", 0, "General-American", 0],
["🇳🇿", "dialect", 0, "New-Zealand", 0],
["🇨🇦", "dialect", 0, "Canada", 0],
["🏴󠁧󠁢󠁳󠁣󠁴󠁿", "dialect", 0, "Scotland", 0],
["🏴󠁧󠁢󠁳󠁣󠁴󠁿", "dialect", 0, "Scottish", 0],
["🏴󠁧󠁢󠁳󠁣󠁴󠁿", "dialect", 0, ["Scotland", "Scottish"], 0],
["🇮🇪", "dialect", 0, "Ireland", 0],
["ccm", "dialect", 0, "cot-caught-merger", 0]
]
103 changes: 103 additions & 0 deletions merge-ipa.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
const StreamZip = require('node-stream-zip');
const { execSync } = require('child_process');
const { readdirSync, existsSync, readFileSync, writeFileSync, unlinkSync } = require('fs');
const { writeInBatches } = require('./util/util');
const date = require('date-and-time');
const now = new Date();

/**
 * Build one merged IPA dictionary per source language.
 *
 * For every language listed in `languages.json`, looks for
 * `data/language/<source>/<target>/kty-<source>-<target>-ipa.zip` across all
 * target languages, merges the transcriptions found for each term, and zips the
 * result to `data/language/<source>/kty-<source>-ipa.zip`. Source languages
 * for which no IPA entries were found produce no output.
 *
 * @returns {Promise<void>} Resolves once every source language has been processed.
 */
async function main(){
    // mkdirSync is not part of this file's top-level fs import, so it is pulled in locally.
    const { mkdirSync } = require('fs');

    const tempIpaPath = 'data/temp/ipa';
    const languages = JSON.parse(readFileSync('languages.json', 'utf8'));

    for (const {iso: sourceIso} of languages){
        // A Map rather than a plain object: terms such as "constructor" would
        // otherwise resolve to Object.prototype members and break the merge.
        const globalIpa = new Map();
        const globalTags = [];
        const seenTagIds = new Set();

        for (const {iso: targetIso} of languages){
            const file = `data/language/${sourceIso}/${targetIso}/kty-${sourceIso}-${targetIso}-ipa.zip`;
            if (!existsSync(file)) continue;

            console.log("found", file);
            const { localIpa, localTags } = await readIpaZip(file);
            mergeLocalIpa(globalIpa, globalTags, seenTagIds, localIpa, localTags);
        }

        const globalIpaLength = globalIpa.size;
        if(globalIpaLength) console.log("globalIpa", globalIpaLength);
        const globalTagsLength = globalTags.length;
        if(globalTagsLength) console.log("globalTags", globalTagsLength);

        if (!globalIpaLength) continue;

        const globalIndex = {
            "format": 3,
            "revision": date.format(now, 'YYYY.MM.DD'),
            "sequenced": true,
            "title": `kty-${sourceIso}-ipa`
        };

        // Start from an empty staging folder, creating it on a fresh checkout.
        mkdirSync(tempIpaPath, { recursive: true });
        for (const staleFile of readdirSync(tempIpaPath)) {
            unlinkSync(`${tempIpaPath}/${staleFile}`);
        }

        writeFileSync(`${tempIpaPath}/index.json`, JSON.stringify(globalIndex, null, 4));
        writeInBatches(tempIpaPath, Array.from(globalIpa.values()), 'term_meta_bank_', 500000);
        writeInBatches(tempIpaPath, globalTags, 'tag_bank_', 50000);

        // `zip` updates an existing archive in place, so banks left over from an
        // earlier run would survive; remove the old archive first.
        const mergedZip = `data/language/${sourceIso}/kty-${sourceIso}-ipa.zip`;
        if (existsSync(mergedZip)) {
            unlinkSync(mergedZip);
        }
        // NOTE(review): sourceIso is interpolated into a shell command; this assumes
        // languages.json is trusted, project-owned input.
        execSync(`zip -j ${mergedZip} ${tempIpaPath}/*`);
    }
}

/**
 * Read all term-meta and tag bank entries out of one IPA dictionary zip.
 *
 * @param {string} file - Path of the zip archive to read.
 * @returns {Promise<{localIpa: Array, localTags: Array}>} Concatenated contents of every
 *   `term_meta_bank_*` entry and every `tag_bank_*` entry in the archive.
 */
async function readIpaZip(file) {
    const zip = new StreamZip.async({ file });
    let localIpa = [];
    let localTags = [];

    try {
        const entryNames = Object.keys(await zip.entries());
        for (const entryName of entryNames) {
            const isTermMetaBank = entryName.startsWith("term_meta_bank_");
            const isTagBank = entryName.startsWith("tag_bank_");
            if (!isTermMetaBank && !isTagBank) continue;

            const json = JSON.parse(await zip.entryData(entryName));
            if (isTermMetaBank) {
                localIpa = localIpa.concat(json);
            } else {
                localTags = localTags.concat(json);
            }
        }

        console.log("localIpa", localIpa.length);
        console.log("localTags", localTags.length);
    } finally {
        // Release the file handle even when an entry fails to read or parse.
        await zip.close();
    }

    return { localIpa, localTags };
}

/**
 * Merge the entries of one dictionary into the per-source-language accumulators.
 *
 * A term seen for the first time is stored as-is. For an already known term,
 * transcriptions with a new `ipa` value are appended and transcriptions with a
 * known `ipa` value get any missing tags added. Every tag referenced by a
 * stored transcription is copied from `localTags` into `globalTags` once,
 * including the tags of a term's first occurrence.
 *
 * @param {Map<string, Array>} globalIpa - Merged entries keyed by term (mutated).
 * @param {Array} globalTags - Merged tag bank rows (mutated).
 * @param {Set<string>} seenTagIds - Ids of the tags already in `globalTags` (mutated).
 * @param {Array} localIpa - Term-meta entries of the dictionary being merged.
 * @param {Array} localTags - Tag bank rows of the dictionary being merged.
 */
function mergeLocalIpa(globalIpa, globalTags, seenTagIds, localIpa, localTags) {
    // Index the tag bank once; the first row wins for a repeated id, as with Array.prototype.find.
    const localTagsById = new Map();
    for (const tagRow of localTags) {
        if (!localTagsById.has(tagRow[0])) {
            localTagsById.set(tagRow[0], tagRow);
        }
    }

    const registerTag = (tagId) => {
        if (seenTagIds.has(tagId)) return;
        const fullTag = localTagsById.get(tagId);
        // A transcription may reference a tag that this dictionary's tag bank does not define.
        if (!fullTag) return;
        seenTagIds.add(tagId);
        globalTags.push(fullTag);
    };

    for (const local of localIpa) {
        const [term] = local;
        const newIpas = local[2]['transcriptions'];
        const known = globalIpa.get(term);

        if (!known) {
            globalIpa.set(term, local);
            for (const newIpa of newIpas) {
                (newIpa.tags || []).forEach(registerTag);
            }
            continue;
        }

        const existingIpas = known[2]['transcriptions'];
        for (const newIpa of newIpas) {
            const incomingTags = newIpa.tags || [];
            const existingIpa = existingIpas.find(({ipa}) => ipa === newIpa.ipa);

            if (!existingIpa) {
                existingIpas.push(newIpa);
                incomingTags.forEach(registerTag);
                continue;
            }

            if (!existingIpa.tags) {
                existingIpa.tags = [];
            }
            for (const tag of incomingTags) {
                if (existingIpa.tags.includes(tag)) continue;
                existingIpa.tags.push(tag);
                registerTag(tag);
            }
        }
    }
}

// Report a failed merge and exit non-zero instead of leaving the promise unhandled.
main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
"dependencies": {
"archiver": "^6.0.1",
"date-and-time": "^2.4.2",
"line-by-line": "^0.1.6"
"line-by-line": "^0.1.6",
"node-stream-zip": "^1.15.0"
},
"description": "Converts Kaikki JSON to Yomitan compatible dictionary.",
"devDependencies": {
Expand Down
2 changes: 1 addition & 1 deletion util/kaikki-breakdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@
# Add labels and title
plt.xlabel("Source Language (headwords in this language)", fontsize=8)
plt.ylabel("Target Language (glosses in this language)", fontsize=8)
plt.title("ymtW", fontsize=12)
plt.title("kaikki-to-yomitan", fontsize=12)

# Save the plot with a higher resolution
plt.savefig("heatmap.png", dpi=300)
29 changes: 27 additions & 2 deletions util/sort-tags.js → util/util.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
const { readFileSync } = require('fs');
const { readFileSync, writeFileSync } = require('fs');

const tagOrder = JSON.parse(readFileSync('data/language/tag_order.json'));

Expand Down Expand Up @@ -56,4 +56,29 @@ function similarSort(tags) {
});
}

module.exports = { sortTags, similarSort };

/**
 * Write `inputArray` to disk as a numbered series of pretty-printed JSON files.
 *
 * Files are named `${tempPath}/${filenamePrefix}<n>.json` with `<n>` starting
 * at 1, each holding at most `batchSize` consecutive entries. Nothing is
 * written for an empty array.
 *
 * The input array is left untouched; batches are copied out with `slice`
 * instead of being removed with `splice`.
 *
 * @param {string} tempPath - Directory the files are written under (must already exist).
 * @param {Array} inputArray - Entries to write.
 * @param {string} filenamePrefix - Path prefix (relative to `tempPath`) placed before the bank number.
 * @param {number} [batchSize=100000] - Maximum number of entries per file; must be at least 1.
 * @throws {RangeError} If `batchSize` is not a number >= 1 (such a value could never make progress).
 */
function writeInBatches(tempPath, inputArray, filenamePrefix, batchSize = 100000) {
    // Fractions are truncated, matching how the previous splice-based loop treated them.
    const entriesPerFile = Math.trunc(batchSize);
    if (!(entriesPerFile >= 1)) {
        throw new RangeError(`batchSize must be a number >= 1, got ${batchSize}`);
    }

    consoleOverwrite(`Writing ${inputArray.length.toLocaleString()} entries of ${filenamePrefix}...`);

    let bankIndex = 0;

    for (let start = 0; start < inputArray.length; start += entriesPerFile) {
        bankIndex += 1;
        const batch = inputArray.slice(start, start + entriesPerFile);
        const filename = `${tempPath}/${filenamePrefix}${bankIndex}.json`;
        const content = JSON.stringify(batch, null, 2);

        writeFileSync(filename, content);
    }
}

/**
 * Clear the current terminal line so the next write starts at its beginning.
 *
 * Writes `\r` (move the cursor to the beginning of the line) and `\x1b[K`
 * (clear the line) to stdout as one chunk.
 */
function clearConsoleLine() {
    const { stdout } = process;
    stdout.write(`\r\x1b[K`);
}

/**
 * Overwrite the current terminal line with `text`.
 *
 * Calls `clearConsoleLine()` and then writes `text` to stdout unchanged, so
 * the caller decides whether a trailing newline is emitted.
 *
 * @param {string} text - Content to show on the freshly cleared line.
 */
function consoleOverwrite(text) {
    const printReplacement = () => {
        process.stdout.write(text);
    };

    clearConsoleLine();
    printReplacement();
}

module.exports = { sortTags, similarSort, writeInBatches, consoleOverwrite, clearConsoleLine };

0 comments on commit 02c810d

Please sign in to comment.