Commit

📚 add dict converter
StefanVukovic99 committed Nov 17, 2023
0 parents commit 88fd8cf
Showing 21 changed files with 2,539 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .env.example
@@ -0,0 +1,4 @@
MAX_SENTENCES=5000000
DEBUG_WORD=sehen
OPENSUBS_PATH=/path/to/opensubs
DICT_NAME=abc
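
These values are read from the environment by the converter scripts (2-tidy-up.js below pulls DEBUG_WORD off process.env, for example). How the .env file itself gets loaded is not shown in this commit; the snippet below is a minimal sketch assuming the dotenv package is used for that.

    // Sketch only: dotenv is an assumption, not something this commit shows.
    require('dotenv').config();

    const { MAX_SENTENCES, DEBUG_WORD, OPENSUBS_PATH, DICT_NAME } = process.env;
    console.log(`Building ${DICT_NAME}, debug word "${DEBUG_WORD}", up to ${MAX_SENTENCES} sentences`);
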
14 changes: 14 additions & 0 deletions .gitignore
@@ -0,0 +1,14 @@
*.json
!tag_bank_term.json
!tag_bank_ipa.json
!package.json

*.zip

*.txt
!instructions.txt

.env

__pycache__
node_modules
35 changes: 35 additions & 0 deletions 1-create-folders.js
@@ -0,0 +1,35 @@
/*
* Copyright (C) 2023 Yezichak Authors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
const {mkdirSync} = require('fs');

// Working directory layout used by the converter scripts, created under data/.
const folders = [
    'freq',
    'freq/metadata',
    'kaikki',
    'sentences',
    'tidy',
    'ipa',
    'language',
    'temp',
    'temp/dict',
    'temp/freq',
    'temp/ipa'
];

for (const folder of folders) {
    mkdirSync(`data/${folder}`, {recursive: true});
}
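
A brief usage note (an assumption about invocation, since no run script appears in this excerpt): the numeric prefix suggests this is the first step, run from the repository root with Node,

    node 1-create-folders.js

which creates the data/ subdirectories that the later scripts read from and write into.
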
243 changes: 243 additions & 0 deletions 2-tidy-up.js
@@ -0,0 +1,243 @@
/*
* Copyright (C) 2023 Yezichak Authors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
const { writeFileSync } = require('fs');

const LineByLineReader = require('line-by-line');

const { language_short, DEBUG_WORD, filename } = process.env;

const lr = new LineByLineReader(`data/kaikki/${filename}`);

const lemmaDict = {};
const formDict = {};

const formStuff = [];
const automatedForms = {};

const blacklistedTags = [
    'inflection-template',
    'table-tags',
    'nominative',
    'canonical',
    'class',
    'error-unknown-tag',
    'error-unrecognized-form',
    'infinitive',
    'includes-article',
    'obsolete',
    'archaic',
];

const uniqueTags = [];

lr.on('line', (line) => {
    if (line) {
        const { word, pos, senses, sounds = [], forms } = JSON.parse(line);

        // Record every inflected form Wiktionary lists, keyed form -> lemma -> pos -> tag strings.
        if (forms) {
            for (const { form, tags } of forms) {
                if (form && tags && !tags.some(value => blacklistedTags.includes(value))) {
                    for (const tag of tags) {
                        if (!uniqueTags.includes(tag)) {
                            uniqueTags.push(tag);
                        }
                    }
                    automatedForms[form] ??= {};
                    automatedForms[form][word] ??= {};
                    automatedForms[form][word][pos] ??= [];

                    automatedForms[form][word][pos].push(tags.join(' '));
                }
            }
        }

        // Keep only the sounds that carry an IPA transcription.
        let ipa = sounds
            .filter(sound => sound?.ipa)
            .map(sound => {
                if (DEBUG_WORD === word) console.log(sound);
                return {
                    ipa: sound.ipa,
                    tags: sound.tags || []
                };
            });

        let nestedGlossObj = {};

        // Senses either define the lemma itself or point at another lemma ("inflection of ...").
        let senseIndex = 0;
        for (const sense of senses) {
            const { raw_glosses, form_of, tags } = sense;

            const glosses = raw_glosses || sense.glosses;

            const selectedTags = (tags || []).filter(tag => ['masculine', 'feminine', 'neuter'].includes(tag));

            if (glosses && glosses.length > 0) {
                if (form_of) {
                    formStuff.push([word, sense, pos]);
                } else {
                    if (!JSON.stringify(glosses).includes('inflection of ')) {
                        lemmaDict[word] ??= {};
                        lemmaDict[word][pos] ??= {};

                        lemmaDict[word][pos].ipa ??= ipa;
                        lemmaDict[word][pos].glosses ??= [];

                        // Multi-level glosses are collected into a nested object and flushed on the last sense.
                        if (glosses.length > 1) {
                            let nestedObj = nestedGlossObj;
                            for (const level of glosses) {
                                nestedObj[level] = nestedObj[level] || {};
                                nestedObj = nestedObj[level];
                            }

                            if (senseIndex === senses.length - 1) {
                                if (Object.keys(nestedGlossObj).length > 0) {
                                    handleNest(nestedGlossObj, word, pos);
                                    nestedGlossObj = {};
                                }
                            }
                        } else if (glosses.length === 1) {
                            if (Object.keys(nestedGlossObj).length > 0) {
                                handleNest(nestedGlossObj, word, pos);
                                nestedGlossObj = {};
                            }

                            const [gloss] = glosses;

                            if (!JSON.stringify(lemmaDict[word][pos].glosses).includes(gloss)) {
                                lemmaDict[word][pos].glosses.push(gloss);
                            }
                        }

                        if (selectedTags.length > 0) {
                            lemmaDict[word][pos].tags ??= [];
                            for (const tag of selectedTags) {
                                if (!lemmaDict[word][pos].tags.includes(tag)) {
                                    lemmaDict[word][pos].tags.push(tag);
                                }
                            }
                        }
                    }

                    if (JSON.stringify(glosses).includes('inflection of ')) {
                        const lemma = sense.glosses[0]
                            .replace(/.+(?=inflection of)/, '')
                            .replace(/ \(.+?\)/, '')
                            .replace(/:$/, '')
                            .replace(/:\\n.+/, '')
                            .replace('inflection of ', '')
                            .replace(/:.+/s, '')
                            .trim();

                        const inflection = sense.glosses[1];

                        if (inflection && !inflection.includes('inflection of ') && word !== lemma) {
                            formDict[word] ??= {};
                            formDict[word][lemma] ??= {};
                            formDict[word][lemma][pos] ??= [];

                            formDict[word][lemma][pos].push(inflection);
                        }
                    }
                }
            }
            senseIndex += 1;
        }
    }
});

lr.on('end', () => {
    for (const [form, info, pos] of formStuff) {
        const { glosses, form_of } = info;
        const lemma = form_of[0].word;

        if (form !== lemma) {
            formDict[form] ??= {};
            formDict[form][lemma] ??= {};
            formDict[form][lemma][pos] ??= [];

            // handle nested form glosses
            const formInfo = !glosses[0].includes('##') ? glosses[0] : glosses[1];

            formDict[form][lemma][pos].push(formInfo);
        }
    }

    let missingForms = 0;

    for (const [form, info] of Object.entries(automatedForms)) {
        if (!formDict[form]) {
            missingForms += 1;

            // limit forms that point to too many lemmas
            if (Object.keys(info).length < 5) {
                for (const [lemma, parts] of Object.entries(info)) {
                    for (const [pos, glosses] of Object.entries(parts)) {
                        if (form !== lemma) {
                            formDict[form] ??= {};
                            formDict[form][lemma] ??= {};
                            formDict[form][lemma][pos] ??= [];

                            const modifiedGlosses = glosses.map(gloss => `-automated- ${gloss}`);
                            formDict[form][lemma][pos].push(...modifiedGlosses);
                        }
                    }
                }
            }
        }
    }

    console.log(`There were ${missingForms.toLocaleString()} missing forms that have now been automatically populated.`);

    writeFileSync(`data/tidy/${language_short}-lemmas.json`, JSON.stringify(lemmaDict));
    writeFileSync(`data/tidy/${language_short}-forms.json`, JSON.stringify(formDict));

    console.log('2-tidy-up.js finished.');
});

function handleLevel(nest, level) {
    const nestDefs = [];
    let defIndex = 0;

    for (const [def, children] of Object.entries(nest)) {
        defIndex += 1;

        if (Object.keys(children).length > 0) {
            const nextLevel = level + 1;
            const childDefs = handleLevel(children, nextLevel);

            const listType = level === 1 ? "li" : "number";
            const content = level === 1 ? def : [{ "tag": "span", "data": { "listType": "number" }, "content": `${defIndex}. ` }, def];

            nestDefs.push([{ "tag": "div", "data": { "listType": listType }, "content": content }, { "tag": "div", "data": { "listType": "ol" }, "content": childDefs }]);
        } else {
            nestDefs.push({ "tag": "div", "data": { "listType": "li" }, "content": [{ "tag": "span", "data": { "listType": "number" }, "content": `${defIndex}. ` }, def] });
        }
    }

    return nestDefs;
}

function handleNest(nestedGlossObj, word, pos) {
    const nestedGloss = handleLevel(nestedGlossObj, 1);

    if (nestedGloss.length > 0) {
        for (const entry of nestedGloss) {
            lemmaDict[word][pos].glosses.push({ "type": "structured-content", "content": entry });
        }
    }
}
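
For reference, the two files written at the end of 2-tidy-up.js have roughly the following shape. This is an illustrative sketch derived from how lemmaDict and formDict are populated above; "de" stands in for the unspecified language_short, and the entries themselves are invented examples (automated entries are prefixed with "-automated- ").

data/tidy/de-lemmas.json:

    {
      "sehen": {
        "verb": {
          "ipa": [{ "ipa": "/ˈzeːən/", "tags": [] }],
          "glosses": ["to see"]
        }
      }
    }

data/tidy/de-forms.json:

    {
      "sieht": {
        "sehen": {
          "verb": ["third-person singular present"]
        }
      }
    }
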