From d0a71039d7683edacb5ee5ca2cc4bc785e014f2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Vukovi=C4=87?= Date: Tue, 15 Oct 2024 12:21:19 +0200 Subject: [PATCH] start adding types (#152) * baseline * early exit on senses * refactor * wip * wip * wip * start adding types --- .gitignore | 2 + .vscode/settings.json | 3 + 3-tidy-up.js | 245 ++++++++++++++++------ 4-make-yomitan.js | 1 + data/test/dict/sq/en/tag_bank_1.json | 7 + data/test/dict/sq/en/term_bank_1.json | 169 +++++++++++++++ data/test/dict/sq/en/term_bank_2.json | 197 +++++++++++++++++ data/test/ipa/sq/en/tag_bank_1.json | 10 +- data/test/ipa/sq/en/term_meta_bank_1.json | 46 ++++ data/test/kaikki/sq-en.json | 3 +- data/test/tidy/sq-en-forms-0.json | 126 +++++++++++ data/test/tidy/sq-en-lemmas.json | 219 +++++++++++++++++++ jsconfig.json | 21 ++ types/types.ts | 106 ++++++++++ util/util.js | 13 +- 15 files changed, 1101 insertions(+), 67 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 jsconfig.json create mode 100644 types/types.ts diff --git a/.gitignore b/.gitignore index aef1c85..e177713 100755 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ *.json *.jsonl +!jsconfig.json +!.vscode/settings.json !tag_bank_term.json !tag_bank_ipa.json !parts_of_speech.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..be944f5 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "javascript.validate.enable": true +} \ No newline at end of file diff --git a/3-tidy-up.js b/3-tidy-up.js index 23c52cf..06fed75 100644 --- a/3-tidy-up.js +++ b/3-tidy-up.js @@ -7,20 +7,34 @@ const { target_iso: targetIso, kaikki_file: kaikkiFile, tidy_folder: writeFolder -} = process.env; +} = /** @type {TidyEnv} */ (process.env); const { sortTags, similarSort, mergePersonTags, consoleOverwrite, clearConsoleLine, logProgress, mapJsonReplacer } = require('./util/util'); +/** @type {LemmaDict} */ const lemmaDict = {}; + +/** @type {FormsMap} */ const formsMap = new Map(); + +/** @type {AutomatedForms} */ const automatedForms = new Map(); +/** + * @param {string} string + * @returns {string} +*/ function escapeRegExp(string) { return string.replace(/[.*+\-?^${}()|[\]\\]/g, '\\$&'); } +/** + * @param {string[]} glosses + * @param {FormOf[]|undefined} formOf + * @returns {boolean} + */ function isInflectionGloss(glosses, formOf) { - glossesString = JSON.stringify(glosses); + const glossesString = JSON.stringify(glosses); switch (targetIso) { case 'de': if (glosses.some(gloss => /des (?:Verbs|Adjektivs|Substantivs|Demonstrativpronomens|Possessivpronomens|Pronomens)/.test(gloss))) return true; @@ -28,6 +42,7 @@ function isInflectionGloss(glosses, formOf) { if (glosses.some(gloss => /.*inflection of.*/.test(gloss))) return true; if(!Array.isArray(formOf)) return false; for (const {word: lemma} of formOf) { + if(!lemma) continue; if (glosses.some(gloss => new RegExp(`of ${escapeRegExp(lemma)}$`).test(gloss))) return true; } @@ -38,12 +53,16 @@ function isInflectionGloss(glosses, formOf) { return false; } - -function handleLevel(nest, level) { +/** + * @param {GlossTree} glossTree + * @param {number} level + * @returns {*} + */ +function handleLevel(glossTree, level) { const nestDefs = []; let defIndex = 0; - for (const [def, children] of nest) { + for (const [def, children] of glossTree) { defIndex += 1; if(children.size > 0) { @@ -65,6 +84,10 @@ function handleLevel(nest, level) { return nestDefs; } +/** + * @param {GlossTree} glossTree + * @param {SenseInfo} sense + */ function handleNest(glossTree, sense) { const nestedGloss = handleLevel(glossTree, 1); @@ -74,15 +97,20 @@ function handleNest(glossTree, sense) { } } } - +/** + * @param {string} form + * @param {string} pos + * @param {string} lemma + * @param {string[]|Set} inflections + */ function addDeinflections(form, pos, lemma, inflections) { if (targetIso === 'fr') { form = form.replace(/(qu\')?(ils\/elles|il\/elle\/on)\s*/, ''); } - const lemmaForms = formsMap.get(lemma) || new Map(); + const lemmaForms = formsMap.get(lemma) || /** @type {Map>} */ (new Map()); formsMap.set(lemma, lemmaForms); - const formPOSs = lemmaForms.get(form) || new Map(); + const formPOSs = lemmaForms.get(form) || /** @type {Map} */ (new Map()); lemmaForms.set(form, formPOSs); formPOSs.get(pos) || formPOSs.set(pos, []); @@ -132,46 +160,26 @@ lr.on('line', (line) => { if (line) { lineCount += 1; logProgress("Processing lines", lineCount); - handleLine(line); + handleLine(JSON.parse(line)); } }); -function handleLine(line) { - const parsedLine = JSON.parse(line); +/** + * @param {KaikkiLine} parsedLine + */ +function handleLine(parsedLine) { const { pos, sounds, forms } = parsedLine; if(!pos) return; const word = getCanonicalWordForm(parsedLine); if (!word) return; - const readings = getReadings(word, parsedLine); - if (forms) { - forms.forEach((formData) => { - const { form } = formData; - let { tags } = formData; - if(!form) return; - if(!tags) return; - if(form === '-') return; - tags = tags.filter(tag => !redundantTags.includes(tag)); - const isBlacklisted = tags.some(value => blacklistedTags.includes(value)); - if (isBlacklisted) return; - const isIdentity = !tags.some(value => !identityTags.includes(value)); - if (isIdentity) return; - - const wordMap = automatedForms.get(word) || new Map(); - const formMap = wordMap.get(form) || new Map(); - formMap.get(pos) || formMap.set(pos, new Set()); - wordMap.set(form, formMap); - automatedForms.set(word, wordMap); - - const tagsSet = new Set((formMap.get(pos))); - - tagsSet.add(sortTags(targetIso, tags).join(' ')); - - formMap.set(pos, similarSort(mergePersonTags(targetIso, Array.from(tagsSet)))); - }); - } + processForms(forms, word, pos); + + const {senses} = parsedLine; + if (!senses) return; - const ipa = sounds + /** @type {IpaInfo[]} */ + const ipa = /** @type {IpaInfo[]} */ (sounds ? sounds .filter(sound => sound && sound.ipa) .map(({ipa, tags, note}) => { @@ -184,16 +192,14 @@ function handleLine(line) { } return ({ipa, tags}) }) - .flatMap(ipaObj => typeof ipaObj.ipa === 'string' ? [ipaObj] : ipaObj.ipa.map(ipa => ({ ipa, tags: ipaObj.tags })) ) - .filter(ipaObj => ipaObj.ipa) - : []; - + .flatMap(ipaObj => typeof ipaObj.ipa === 'string' ? [ipaObj] : ipaObj?.ipa?.map(ipa => ({ ipa, tags: ipaObj.tags })) ) + .filter(ipaObj => ipaObj?.ipa) + : []); - const {senses} = parsedLine; - if (!senses) return; - - const sensesWithGlosses = senses.filter(sense => sense.glosses || sense.raw_glosses || sense.raw_gloss); - sensesWithGlosses.map(sense => { + /** @type {TidySense[]} */ + const sensesWithGlosses = /** @type {TidySense[]} */ (senses + .filter(sense => sense.glosses || sense.raw_glosses || sense.raw_gloss) + .map(sense => { const glosses = sense.raw_glosses || sense.raw_gloss || sense.glosses; const glossesArray = Array.isArray(glosses) ? glosses : [glosses]; @@ -202,9 +208,8 @@ function handleLine(line) { tags.push(...sense.raw_tags); } - sense.glossesArray = glossesArray; - sense.tags = tags; - }); + return {...sense, glossesArray, tags}; + })); const sensesWithoutInflectionGlosses = sensesWithGlosses.filter(sense => { const {glossesArray, form_of, glosses} = sense; @@ -214,38 +219,44 @@ function handleLine(line) { }); if (sensesWithoutInflectionGlosses.length === 0) return; - + + const readings = getReadings(word, parsedLine); initializeWordResult(word, readings, pos); for (const ipaObj of ipa) { saveIpaResult(word, readings, pos, ipaObj); } + /** @type {GlossTree} */ const glossTree = new Map(); for (const sense of sensesWithoutInflectionGlosses) { const { glossesArray, tags } = sense; let temp = glossTree; for (const [levelIndex, levelGloss] of glossesArray.entries()) { - if(!temp.get(levelGloss)) { - temp.set(levelGloss, new Map()); + let curr = temp.get(levelGloss); + if(!curr) { + curr = new Map(); + temp.set(levelGloss, curr); if(levelIndex === 0) { - temp.get(levelGloss).set('_tags', tags); + curr.set('_tags', tags); } } else if (levelIndex === 0) { - temp.get(levelGloss).set('_tags', tags.filter(value => temp.get(levelGloss).get('_tags').includes(value))); + curr.set('_tags', tags.filter(value => curr?.get('_tags')?.includes(value))); } - temp = temp.get(levelGloss); + temp = curr; } } for (const [gloss, children] of glossTree) { - const tags = children.get('_tags'); - children.delete('_tags'); + const tags = children.get('_tags') || []; + children.delete('_tags'); + /** @type {SenseInfo} */ const currSense = { glosses: [], tags }; if(children.size === 0) { currSense.glosses.push(gloss); } else { + /** @type {GlossTree} */ const branch = new Map(); branch.set(gloss, children); handleNest(branch, currSense); @@ -257,12 +268,59 @@ function handleLine(line) { } } +/** + * @param {FormInfo[]|undefined} forms + * @param {string} word + * @param {string} pos + */ +function processForms(forms, word, pos) { + if(!forms) return; + forms.forEach((formData) => { + const { form } = formData; + let { tags } = formData; + if (!form) return; + if (!tags) return; + if (form === '-') return; + tags = tags.filter(tag => !redundantTags.includes(tag)); + const isBlacklisted = tags.some(value => blacklistedTags.includes(value)); + if (isBlacklisted) return; + const isIdentity = !tags.some(value => !identityTags.includes(value)); + if (isIdentity) return; + + /** @type {Map>>} */ + const wordMap = automatedForms.get(word) || new Map(); + /** @type {Map|string[]>} */ + const formMap = wordMap.get(form) || new Map(); + formMap.get(pos) || formMap.set(pos, new Set()); + wordMap.set(form, formMap); + automatedForms.set(word, wordMap); + + const tagsSet = new Set((formMap.get(pos))); + + tagsSet.add(sortTags(targetIso, tags).join(' ')); + + formMap.set(pos, similarSort(mergePersonTags(targetIso, Array.from(tagsSet)))); + }); +} + +/** + * @param {string} word + * @param {string[]} readings + * @param {string} pos + * @param {SenseInfo} currSense + */ function saveSenseResult(word, readings, pos, currSense) { for (const reading of readings) { lemmaDict[word][reading][pos].senses.push(currSense); } } +/** + * @param {string} word + * @param {string[]} readings + * @param {string} pos + * @param {IpaInfo} ipaObj + */ function saveIpaResult(word, readings, pos, ipaObj) { for (const reading of readings) { const result = lemmaDict[word][reading][pos]; @@ -272,6 +330,11 @@ function saveIpaResult(word, readings, pos, ipaObj) { } } +/** + * @param {string} word + * @param {string[]} readings + * @param {string} pos + */ function initializeWordResult(word, readings, pos) { for (const reading of readings) { const result = ensureNestedObject(lemmaDict, [word, reading, pos]); @@ -280,6 +343,12 @@ function initializeWordResult(word, readings, pos) { } } +/** + * @param {Glosses|undefined} glosses + * @param {string} word + * @param {string} pos + * @returns + */ function processInflectionGlosses(glosses, word, pos) { switch (targetIso) { case 'de': @@ -287,6 +356,10 @@ function processInflectionGlosses(glosses, word, pos) { case 'en': return processEnglishInflectionGlosses(glosses, word, pos); case 'fr': + if(!glosses) return; + /** + * @type {string|undefined} + */ let inflection, lemma; const match1 = glosses[0].match(/(.*)du verbe\s+((?:(?!\bdu\b).)*)$/); @@ -312,7 +385,14 @@ function processInflectionGlosses(glosses, word, pos) { } } +/** + * @param {Glosses|undefined} glosses + * @param {string} word + * @param {string} pos + * @returns + */ function processGermanInflectionGlosses(glosses, word, pos) { + if (!glosses || !Array.isArray(glosses)) return; const match1 = glosses[0].match(/(.*)des (?:Verbs|Adjektivs|Substantivs|Demonstrativpronomens|Possessivpronomens|Pronomens) (.*)$/); if (!match1 || match1.length < 3) return; const inflection = match1[1].trim(); @@ -322,6 +402,11 @@ function processGermanInflectionGlosses(glosses, word, pos) { } } +/** + * @param {NestedObject} obj + * @param {string[]} keys + * @returns {NestedObject} + */ function ensureNestedObject(obj, keys) { for (const key of keys) { obj[key] ??= {}; @@ -330,10 +415,17 @@ function ensureNestedObject(obj, keys) { return obj; } +/** + * @param {Glosses|undefined} glosses + * @param {string} word + * @param {string} pos + */ function processEnglishInflectionGlosses(glosses, word, pos) { - if(!glosses) return; - glossPieces = glosses.flatMap(gloss => gloss.split('##').map(piece => piece.trim())); + if(!glosses || !Array.isArray(glosses)) return; + const glossPieces = glosses.flatMap(gloss => gloss.split('##').map(piece => piece.trim())); + /** @type {Set} */ const lemmas = new Set(); + /** @type {Set} */ const inflections = new Set(); for (const piece of glossPieces) { const lemmaMatch = piece.match(/of ([^\s]+)\s*$/); @@ -371,6 +463,10 @@ function processEnglishInflectionGlosses(glosses, word, pos) { } } +/** + * @param {KaikkiLine} line + * @returns {string|undefined} + */ function getCanonicalWordForm({word, forms}) { if(!forms) return word; @@ -389,6 +485,11 @@ function getCanonicalWordForm({word, forms}) { } } +/** + * @param {string|undefined} word + * @param {FormInfo[]} forms + * @returns {string|undefined} + */ function getCanonicalForm(word, forms) { const canonicalForm = forms.find(form => form.tags && form.tags.includes('canonical') @@ -400,7 +501,7 @@ function getCanonicalForm(word, forms) { word = word.replace(/ {{#if:.+/, '').trim(); } - bracketsRegex = /\[.*\]$/; + const bracketsRegex = /\[.*\]$/; if (bracketsRegex.test(word)) { word = word.replace(bracketsRegex, '').trim(); } @@ -408,6 +509,11 @@ function getCanonicalForm(word, forms) { return word; } +/** + * @param {string} word + * @param {KaikkiLine} line + * @returns {string[]} + */ function getReadings(word, line){ switch(sourceIso){ case 'fa': return [getPersianReading(word, line)]; @@ -417,13 +523,23 @@ function getReadings(word, line){ } } +/** + * @param {string} word + * @param {KaikkiLine} line + * @returns {string} + */ function getPersianReading(word, line){ const {forms} = line; if(!forms) return word; const romanization = forms.find(({form, tags}) => tags && tags.includes('romanization') && tags.length === 1 && form); - return romanization ? romanization.form : word; + return romanization?.form || word; } +/** + * @param {string} word + * @param {KaikkiLine} line + * @returns {string[]} + */ function getJapaneseReadings(word, line){ const {head_templates} = line; if(!head_templates) { @@ -507,13 +623,14 @@ lr.on('end', () => { const formsFilePath = `${writeFolder}/${sourceIso}-${targetIso}-forms.json`; + /** @type {{[chunkIndex: string]: FormsMap}} */ const mapChunks = Array.from(formsMap.entries()).reduce((acc, [key, value], index) => { logProgress("Chunking form dict", index, formsMap.size); const chunkIndex = Math.floor(index / 10000); acc[chunkIndex] ??= new Map(); acc[chunkIndex].set(key, value); return acc; - }, {}); + }, /** @type {{[chunkIndex: string]: FormsMap}} */ ({})); if(!mapChunks['0']) { mapChunks['0'] = new Map(); diff --git a/4-make-yomitan.js b/4-make-yomitan.js index 32a9e47..5942af4 100644 --- a/4-make-yomitan.js +++ b/4-make-yomitan.js @@ -1,3 +1,4 @@ +//@ts-nocheck const path = require('path'); const { readFileSync, writeFileSync, existsSync, readdirSync, mkdirSync, unlinkSync } = require('fs'); const { sortTags, writeInBatches, consoleOverwrite, diff --git a/data/test/dict/sq/en/tag_bank_1.json b/data/test/dict/sq/en/tag_bank_1.json index 0143cd3..895bfd1 100644 --- a/data/test/dict/sq/en/tag_bank_1.json +++ b/data/test/dict/sq/en/tag_bank_1.json @@ -12,5 +12,12 @@ -1, "masculine", 1 + ], + [ + "fem", + "", + -1, + "feminine", + 1 ] ] \ No newline at end of file diff --git a/data/test/dict/sq/en/term_bank_1.json b/data/test/dict/sq/en/term_bank_1.json index d9ba885..4672f1e 100644 --- a/data/test/dict/sq/en/term_bank_1.json +++ b/data/test/dict/sq/en/term_bank_1.json @@ -10,5 +10,174 @@ ], 0, "" + ], + [ + "gjuhë", + "", + "n fem", + "n", + 0, + [ + { + "type": "structured-content", + "content": [ + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": "tongue (organ)" + }, + { + "tag": "div", + "data": { + "listType": "ol" + }, + "style": { + "marginLeft": 2 + }, + "content": [ + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": [ + { + "tag": "span", + "data": { + "listType": "number" + }, + "content": "1. " + }, + "(figurative) speech, talking" + ] + }, + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": [ + { + "tag": "span", + "data": { + "listType": "number" + }, + "content": "2. " + }, + "strip of land" + ] + }, + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": [ + { + "tag": "span", + "data": { + "listType": "number" + }, + "content": "3. " + }, + "bell clapper, clanger, tongue" + ] + } + ] + } + ] + }, + { + "type": "structured-content", + "content": [ + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": "language, tongue" + }, + { + "tag": "div", + "data": { + "listType": "ol" + }, + "style": { + "marginLeft": 2 + }, + "content": [ + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": [ + { + "tag": "span", + "data": { + "listType": "number" + }, + "content": "1. " + }, + "register, speech, style" + ] + }, + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": [ + { + "tag": "span", + "data": { + "listType": "number" + }, + "content": "2. " + }, + "language (generally, any form of communication)" + ] + }, + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": [ + { + "tag": "span", + "data": { + "listType": "number" + }, + "content": "3. " + }, + "(colloquial) local dialect" + ] + }, + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": [ + { + "tag": "span", + "data": { + "listType": "number" + }, + "content": "4. " + }, + "(colloquial) Albanian, as a subject in school" + ] + } + ] + } + ] + } + ], + 0, + "" ] ] \ No newline at end of file diff --git a/data/test/dict/sq/en/term_bank_2.json b/data/test/dict/sq/en/term_bank_2.json index 7f01320..f9af542 100644 --- a/data/test/dict/sq/en/term_bank_2.json +++ b/data/test/dict/sq/en/term_bank_2.json @@ -34,5 +34,202 @@ ], 0, "" + ], + [ + "gjuhëra/gjuhëna", + "", + "non-lemma", + "", + 0, + [ + [ + "gjuhë", + [ + "plural", + "dialectal" + ] + ] + ], + 0, + "" + ], + [ + "gjuha", + "", + "non-lemma", + "", + 0, + [ + [ + "gjuhë", + [ + "definite" + ] + ], + [ + "gjuhë", + [ + "nominative", + "singular", + "definite" + ] + ] + ], + 0, + "" + ], + [ + "gjuhët", + "", + "non-lemma", + "", + 0, + [ + [ + "gjuhë", + [ + "accusative", + "plural", + "definite" + ] + ], + [ + "gjuhë", + [ + "nominative", + "plural", + "definite" + ] + ] + ], + 0, + "" + ], + [ + "gjuhën", + "", + "non-lemma", + "", + 0, + [ + [ + "gjuhë", + [ + "accusative", + "singular", + "definite" + ] + ] + ], + 0, + "" + ], + [ + "gjuhe", + "", + "non-lemma", + "", + 0, + [ + [ + "gjuhë", + [ + "singular", + "indefinite", + "ablative" + ] + ], + [ + "gjuhë", + [ + "dative", + "singular", + "indefinite" + ] + ] + ], + 0, + "" + ], + [ + "gjuhës", + "", + "non-lemma", + "", + 0, + [ + [ + "gjuhë", + [ + "singular", + "definite", + "ablative" + ] + ], + [ + "gjuhë", + [ + "dative", + "singular", + "definite" + ] + ] + ], + 0, + "" + ], + [ + "gjuhëve", + "", + "non-lemma", + "", + 0, + [ + [ + "gjuhë", + [ + "plural", + "definite", + "ablative" + ] + ], + [ + "gjuhë", + [ + "dative", + "plural", + "definite" + ] + ], + [ + "gjuhë", + [ + "dative", + "plural", + "indefinite" + ] + ] + ], + 0, + "" + ], + [ + "gjuhësh", + "", + "non-lemma", + "", + 0, + [ + [ + "gjuhë", + [ + "plural", + "indefinite", + "ablative" + ] + ] + ], + 0, + "" ] ] \ No newline at end of file diff --git a/data/test/ipa/sq/en/tag_bank_1.json b/data/test/ipa/sq/en/tag_bank_1.json index 0637a08..0dd99bc 100644 --- a/data/test/ipa/sq/en/tag_bank_1.json +++ b/data/test/ipa/sq/en/tag_bank_1.json @@ -1 +1,9 @@ -[] \ No newline at end of file +[ + [ + "Gheg", + "dialect", + 0, + "Gheg", + 0 + ] +] \ No newline at end of file diff --git a/data/test/ipa/sq/en/term_meta_bank_1.json b/data/test/ipa/sq/en/term_meta_bank_1.json index 27aa487..32e1f48 100644 --- a/data/test/ipa/sq/en/term_meta_bank_1.json +++ b/data/test/ipa/sq/en/term_meta_bank_1.json @@ -11,5 +11,51 @@ } ] } + ], + [ + "gjuhë", + "ipa", + { + "reading": "gjuhë", + "transcriptions": [ + { + "ipa": "/ˈɟuhə/", + "tags": [] + }, + { + "ipa": "[ˈɡjuː(h)]", + "tags": [ + "Gheg", + "Northern" + ] + }, + { + "ipa": "[ˈɡuː(h)]", + "tags": [ + "Gheg", + "Northern" + ] + }, + { + "ipa": "[ˈɡũː]", + "tags": [ + "Kosovo" + ] + }, + { + "ipa": "[ˈɡʎuhə]", + "tags": [ + "Arbëresh", + "Arvanitika" + ] + }, + { + "ipa": "[ˈɡʎuɣə]", + "tags": [ + "Calabria" + ] + } + ] + } ] ] \ No newline at end of file diff --git a/data/test/kaikki/sq-en.json b/data/test/kaikki/sq-en.json index b6169d9..1c2e7cd 100644 --- a/data/test/kaikki/sq-en.json +++ b/data/test/kaikki/sq-en.json @@ -1,2 +1,3 @@ {"pos": "noun", "head_templates": [{"name": "head", "args": {"1": "sq", "2": "noun", "head": "", "sort": "", "g": "m", "cat2": "masculine nouns"}, "expansion": "akull m"}, {"name": "sq-noun", "args": {"1": "m", "2": "akuj"}, "expansion": "akull m (plural akuj)"}], "forms": [{"form": "akuj", "tags": ["plural"]}], "etymology_number": 1, "wikipedia": ["Vladimir Orel"], "etymology_text": "Uncertain. Possibly:\n# A derivation from Proto-Indo-European *keHl- whence also Proto-Celtic *kaletos (“hard”), Proto-Slavic *kaliti (“to temper, harden”), Latin callum (“hardened skin”).\n# Borrowed from Germanic, ultimately from Proto-Germanic *jekulaz (“icicle”).\n# Akin Old Armenian ոյծ (oyc, “cold, frost”), suffixed with -ull, though the two terms are phonologically incompatible.", "etymology_templates": [{"name": "unc", "args": {"1": "sq"}, "expansion": "Uncertain"}, {"name": "der", "args": {"1": "sq", "2": "ine-pro", "3": "", "4": "*keHl-"}, "expansion": "Proto-Indo-European *keHl-"}, {"name": "cog", "args": {"1": "cel-pro", "2": "*kaletos", "t": "hard"}, "expansion": "Proto-Celtic *kaletos (“hard”)"}, {"name": "cog", "args": {"1": "sla-pro", "2": "*kaliti", "t": "to temper, harden"}, "expansion": "Proto-Slavic *kaliti (“to temper, harden”)"}, {"name": "cog", "args": {"1": "la", "2": "callum", "t": "hardened skin"}, "expansion": "Latin callum (“hardened skin”)"}, {"name": "glossary", "args": {"1": "loanword", "2": "Borrowed"}, "expansion": "Borrowed"}, {"name": "bor", "args": {"1": "sq", "2": "gem", "3": "", "4": "", "5": "", "lit": "", "pos": "", "tr": "", "ts": "", "id": "", "sc": "", "g": "", "g2": "", "g3": "", "nocat": "", "sort": ""}, "expansion": "Germanic"}, {"name": "bor+", "args": {"1": "sq", "2": "gem"}, "expansion": "Borrowed from Germanic"}, {"name": "der", "args": {"1": "sq", "2": "gem-pro", "3": "*jekulaz", "t": "icicle"}, "expansion": "Proto-Germanic *jekulaz (“icicle”)"}, {"name": "cog", "args": {"1": "xcl", "2": "ոյծ", "t": "cold, frost"}, "expansion": "Old Armenian ոյծ (oyc, “cold, frost”)"}, {"name": "af", "args": {"1": "sq", "2": "-ull"}, "expansion": "-ull"}], "sounds": [{"ipa": "/ˈakuɫ/"}, {"rhymes": "-akuɫ"}], "word": "akull", "lang": "Albanian", "lang_code": "sq", "senses": [{"links": [["ice", "ice"]], "glosses": ["ice"], "tags": ["masculine"], "id": "akull-sq-noun-TLCyUMYl"}]} -{"pos": "verb", "head_templates": [{"name": "head", "args": {"1": "sq", "2": "verb form"}, "expansion": "ik"}], "word": "ik", "lang": "Albanian", "lang_code": "sq", "senses": [{"links": [["iki", "iki#Albanian"]], "glosses": ["second-person singular imperative of iki"], "tags": ["form-of", "imperative", "second-person", "singular"], "form_of": [{"word": "iki"}], "id": "ik-sq-verb-OAavRVQB", "categories": []}]} \ No newline at end of file +{"pos": "verb", "head_templates": [{"name": "head", "args": {"1": "sq", "2": "verb form"}, "expansion": "ik"}], "word": "ik", "lang": "Albanian", "lang_code": "sq", "senses": [{"links": [["iki", "iki#Albanian"]], "glosses": ["second-person singular imperative of iki"], "tags": ["form-of", "imperative", "second-person", "singular"], "form_of": [{"word": "iki"}], "id": "ik-sq-verb-OAavRVQB", "categories": []}]} +{"pos": "noun", "forms": [{"form": "gjuhë", "tags": ["plural"]}, {"form": "gjuhëra/gjuhëna", "tags": ["dialectal", "plural"]}, {"form": "gjuha", "tags": ["definite"]}, {"form": "no-table-tags", "source": "declension", "tags": ["table-tags"]}, {"form": "sq-noun-f", "source": "declension", "tags": ["inflection-template"]}, {"form": "gjuhë", "tags": ["indefinite", "nominative", "singular"], "source": "declension"}, {"form": "gjuha", "tags": ["definite", "nominative", "singular"], "source": "declension"}, {"form": "gjuhë", "tags": ["indefinite", "nominative", "plural"], "source": "declension"}, {"form": "gjuhët", "tags": ["definite", "nominative", "plural"], "source": "declension"}, {"form": "gjuhë", "tags": ["accusative", "indefinite", "singular"], "source": "declension"}, {"form": "gjuhën", "tags": ["accusative", "definite", "singular"], "source": "declension"}, {"form": "gjuhë", "tags": ["accusative", "indefinite", "plural"], "source": "declension"}, {"form": "gjuhët", "tags": ["accusative", "definite", "plural"], "source": "declension"}, {"form": "gjuhe", "tags": ["dative", "indefinite", "singular"], "source": "declension"}, {"form": "gjuhës", "tags": ["dative", "definite", "singular"], "source": "declension"}, {"form": "gjuhëve", "tags": ["dative", "indefinite", "plural"], "source": "declension"}, {"form": "gjuhëve", "tags": ["dative", "definite", "plural"], "source": "declension"}, {"form": "gjuhe", "tags": ["ablative", "indefinite", "singular"], "source": "declension"}, {"form": "gjuhës", "tags": ["ablative", "definite", "singular"], "source": "declension"}, {"form": "gjuhësh", "tags": ["ablative", "indefinite", "plural"], "source": "declension"}, {"form": "gjuhëve", "tags": ["ablative", "definite", "plural"], "source": "declension"}], "inflection_templates": [{"name": "sq-decl-noun", "args": {"1": "gjuhë", "2": "gjuhë", "3": "gjuha", "4": "gjuhët", "5": "gjuhë", "6": "gjuhë", "7": "gjuhën", "8": "gjuhët", "9": "gjuhe", "10": "gjuhëve", "11": "gjuhës", "12": "gjuhëve", "13": "gjuhësh"}}], "sounds": [{"ipa": "/ˈɟuhə/"}, {"tags": ["Gheg", "Northern"], "ipa": "[ˈɡjuː(h)]"}, {"tags": ["Gheg", "Northern"], "ipa": "[ˈɡuː(h)]"}, {"tags": ["Kosovo"], "ipa": "[ˈɡũː]"}, {"tags": ["Arbëresh", "Arvanitika"], "ipa": "[ˈɡʎuhə]"}, {"note": "Calabria", "ipa": "[ˈɡʎuɣə]"}, {"rhymes": "-uhə"}], "wikipedia": ["Vladimir Orel"], "etymology_text": "Unclear. Akin to Arbëresh glunzë (“voice”). Possibilities include:\n# Inherited from Proto-Indo-European *gol(H)-s-os, via a byform *gl̥(H)-s-ós, whence also Proto-Slavic *golsъ (“voice”), Lithuanian gal̃sas (“voice”), Proto-Germanic *kalz-ōną (“to call”). However the medial -h- instead of expected **-sh- is left unexplained.\n# From a byform *ǵʰnud-sḱ-eh₂, doubly methasised from Proto-Indo-European *dn̥ǵʰwéh₂s ~ *dn̥ǵʰuh₂és (“tongue”). Compare Tocharian B kantwo, also metathised. The outcome gl- (and later gj-) from original *ǵ(ʰ)n- is also attested in gju (“knee”). The usage of the infixed *-sḱ- does not seem have any parallels.\n# A connection with Ancient Greek γλῶσσα (glôssa), itself of unclear origin, cannot be proven.", "etymology_templates": [{"name": "unk", "args": {"1": "sq", "2": "Unclear"}, "expansion": "Unclear"}, {"name": "glossary", "args": {"1": "Inherited"}, "expansion": "Inherited"}, {"name": "inh", "args": {"1": "sq", "2": "ine-pro", "3": "", "4": "*gol(H)-s-os", "5": "", "lit": "", "pos": "", "tr": "", "ts": "", "id": "", "sc": "", "g": "", "g2": "", "g3": "", "nocat": "", "sort": ""}, "expansion": "Proto-Indo-European *gol(H)-s-os"}, {"name": "inh+", "args": {"1": "sq", "2": "ine-pro", "3": "", "4": "*gol(H)-s-os"}, "expansion": "Inherited from Proto-Indo-European *gol(H)-s-os"}, {"name": "cog", "args": {"1": "sla-pro", "2": "*golsъ", "t": "voice"}, "expansion": "Proto-Slavic *golsъ (“voice”)"}, {"name": "cog", "args": {"1": "lt", "2": "gal̃sas", "t": "voice"}, "expansion": "Lithuanian gal̃sas (“voice”)"}, {"name": "cog", "args": {"1": "gem-pro", "2": "*kalzōną", "3": "*kalz-ōną", "t": "to call"}, "expansion": "Proto-Germanic *kalz-ōną (“to call”)"}, {"name": "der", "args": {"1": "sq", "2": "ine-pro", "3": "*dn̥ǵʰwéh₂s", "4": "*dn̥ǵʰwéh₂s ~ *dn̥ǵʰuh₂és", "t": "tongue"}, "expansion": "Proto-Indo-European *dn̥ǵʰwéh₂s ~ *dn̥ǵʰuh₂és (“tongue”)"}, {"name": "cog", "args": {"1": "txb", "2": "kantwo"}, "expansion": "Tocharian B kantwo"}, {"name": "cog", "args": {"1": "grc", "2": "γλῶσσα"}, "expansion": "Ancient Greek γλῶσσα (glôssa)"}], "word": "gjuhë", "lang": "Albanian", "lang_code": "sq", "synonyms": [{"tags": ["obsolete"], "word": "gluhë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "now Cham", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "Arbëresh", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "Arvanitika", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"tags": ["obsolete"], "word": "gjuhu", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "Gheg", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhunë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"tags": ["Gheg"], "word": "gjuhënë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"tags": ["Gheg", "Northern"], "word": "guhë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"tags": ["dialectal"], "word": "gû", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "Kosovo", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gju — Borgo Erizzo", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"english": "Sicily", "word": "gëluhë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"english": "Calabria", "word": "gjufë", "_dis1": "0 0 0 0 0 0 0 0 0"}], "derived": [{"word": "dygjuhësi", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "dygjuhësh", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhcë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhësi", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhësisht", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhësor", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëtar", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëz", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëzoj", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhor", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "shumëgjuhësh", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhë letrare", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhë nëne", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëbilbil", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëbrisk", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëçarë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëdele", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëdreri", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëgjarpër", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëgjatë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëhelm", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëkrijues", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëkuq", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëlashtë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëlëshuar", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëlidhur", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëlopatë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëlopë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëllomkë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëmbajtur", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëmite", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëmpirë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëmprehtë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhënepërkë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhënuse", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëpremë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëprerë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëqen", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhërrënduar", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëshkurtër", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëshkurtuar", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëshpatë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhështhurur", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëtrashë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëtharë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëthikë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhëzënë", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhujëse", "_dis1": "0 0 0 0 0 0 0 0 0"}, {"word": "gjuhustër", "_dis1": "0 0 0 0 0 0 0 0 0"}], "senses": [{"examples": [{"text": "Mbaje gjuhën!", "english": "Hold your tongue!", "type": "example"}, {"text": "E ka gjuhën të gjatë.", "english": "(literally, “She has a long tongue.”)", "type": "example", "roman": "She is very talkative."}], "links": [["tongue", "tongue"], ["speech", "speech"], ["talking", "talking"]], "raw_glosses": ["tongue (organ)", "(figurative) speech, talking"], "glosses": ["tongue (organ)", "speech, talking"], "synonyms": [{"word": "gojë"}], "tags": ["feminine", "figuratively"], "id": "en-gjuhë-sq-noun-4U3OJriL", "categories": [{"name": "Albanian terms with collocations", "kind": "other", "parents": ["Terms with collocations", "Entry maintenance"], "source": "w"}, {"name": "Body parts", "kind": "topical", "parents": ["Body", "Anatomy", "All topics", "Biology", "Medicine", "Fundamental", "Sciences", "Healthcare", "Health"], "source": "w", "orig": "sq:Body parts", "langcode": "sq"}]}, {"links": [["tongue", "tongue"], ["strip", "strip"], ["land", "land"]], "glosses": ["tongue (organ)", "strip of land"], "synonyms": [{"word": "rrip"}], "tags": ["feminine"], "id": "en-gjuhë-sq-noun-Tfx~l-b2", "categories": [{"name": "Albanian terms with collocations", "kind": "other", "parents": ["Terms with collocations", "Entry maintenance"], "source": "w"}, {"name": "Body parts", "kind": "topical", "parents": ["Body", "Anatomy", "All topics", "Biology", "Medicine", "Fundamental", "Sciences", "Healthcare", "Health"], "source": "w", "orig": "sq:Body parts", "langcode": "sq"}]}, {"links": [["tongue", "tongue"], ["bell", "bell"], ["clapper", "clapper"], ["clanger", "clanger"]], "glosses": ["tongue (organ)", "bell clapper, clanger, tongue"], "synonyms": [{"word": "gjuhëz"}], "tags": ["feminine"], "id": "en-gjuhë-sq-noun-zu-bA4a3", "categories": [{"name": "Albanian terms with collocations", "kind": "other", "parents": ["Terms with collocations", "Entry maintenance"], "source": "w"}, {"name": "Body parts", "kind": "topical", "parents": ["Body", "Anatomy", "All topics", "Biology", "Medicine", "Fundamental", "Sciences", "Healthcare", "Health"], "source": "w", "orig": "sq:Body parts", "langcode": "sq"}]}, {"examples": [{"text": "gjuhë lope e zier", "english": "boiled beef tongue", "type": "example"}, {"text": "Dogji gjuhën.", "english": "I burned my tongue.", "type": "example"}], "links": [["tongue", "tongue"]], "glosses": ["tongue (organ)"], "tags": ["feminine"], "id": "en-gjuhë-sq-noun-r4b272FF", "categories": [{"name": "Albanian terms with collocations", "kind": "other", "parents": ["Terms with collocations", "Entry maintenance"], "source": "w"}, {"name": "Body parts", "kind": "topical", "parents": ["Body", "Anatomy", "All topics", "Biology", "Medicine", "Fundamental", "Sciences", "Healthcare", "Health"], "source": "w", "orig": "sq:Body parts", "langcode": "sq"}]}, {"examples": [{"text": "gjuha e fëmijëve", "english": "children speech", "type": "example"}, {"text": "gjuhë e trashë", "english": "foul language", "type": "example"}, {"text": "gjuha e shkrimtarit", "english": "the author's style", "type": "example"}], "links": [["language", "language"], ["tongue", "tongue"], ["register", "register"], ["speech", "speech"], ["style", "style"]], "glosses": ["language, tongue", "register, speech, style"], "synonyms": [{"word": "ligjërim"}, {"word": "stil"}], "tags": ["feminine"], "id": "en-gjuhë-sq-noun--CHs0sns", "categories": [{"name": "Albanian terms with collocations", "kind": "other", "parents": ["Terms with collocations", "Entry maintenance"], "source": "w"}]}, {"examples": [{"text": "gjuha e muzikës", "english": "music's language", "type": "example"}, {"text": "gjuha e bletëve", "english": "bees' language", "type": "example"}], "links": [["language", "language"], ["tongue", "tongue"]], "glosses": ["language, tongue", "language (generally, any form of communication)"], "tags": ["feminine"], "id": "en-gjuhë-sq-noun-nlIefoUV", "categories": [{"name": "Albanian terms with collocations", "kind": "other", "parents": ["Terms with collocations", "Entry maintenance"], "source": "w"}]}, {"links": [["language", "language"], ["tongue", "tongue"], ["dialect", "dialect"]], "raw_glosses": ["language, tongue", "(colloquial) local dialect"], "glosses": ["language, tongue", "local dialect"], "synonyms": [{"word": "e folme"}, {"word": "dialekt"}], "tags": ["colloquial", "feminine"], "id": "en-gjuhë-sq-noun-mWdoYa8o", "categories": [{"name": "Albanian terms with collocations", "kind": "other", "parents": ["Terms with collocations", "Entry maintenance"], "source": "w"}]}, {"links": [["language", "language"], ["tongue", "tongue"], ["Albanian", "Albanian"], ["subject", "subject"]], "raw_glosses": ["language, tongue", "(colloquial) Albanian, as a subject in school"], "glosses": ["language, tongue", "Albanian, as a subject in school"], "tags": ["colloquial", "feminine"], "id": "en-gjuhë-sq-noun-7CKeEbtj", "categories": [{"name": "Albanian terms with collocations", "kind": "other", "parents": ["Terms with collocations", "Entry maintenance"], "source": "w"}, {"name": "Albanian entries with incorrect language header", "kind": "other", "parents": ["Entries with incorrect language header", "Entry maintenance"], "source": "w+disamb", "_dis": "3 3 4 3 16 16 16 23 16"}, {"name": "Pages with 1 entry", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "4 2 7 2 9 10 9 48 9"}, {"name": "Pages with entries", "kind": "other", "parents": [], "source": "w+disamb", "_dis": "4 2 3 2 9 15 9 47 9"}]}, {"examples": [{"text": "gjuha shqipe", "english": "the Albanian language", "type": "example"}], "links": [["language", "language"], ["tongue", "tongue"]], "glosses": ["language, tongue"], "tags": ["feminine"], "id": "en-gjuhë-sq-noun-GSYYUYQQ", "categories": [{"name": "Albanian terms with collocations", "kind": "other", "parents": ["Terms with collocations", "Entry maintenance"], "source": "w"}]}]} \ No newline at end of file diff --git a/data/test/tidy/sq-en-forms-0.json b/data/test/tidy/sq-en-forms-0.json index bde81ba..76f289c 100644 --- a/data/test/tidy/sq-en-forms-0.json +++ b/data/test/tidy/sq-en-forms-0.json @@ -44,6 +44,132 @@ ] ] } + ], + [ + "gjuhë", + { + "_type": "map", + "map": [ + [ + "gjuhëra/gjuhëna", + { + "_type": "map", + "map": [ + [ + "noun", + [ + "plural dialectal" + ] + ] + ] + } + ], + [ + "gjuha", + { + "_type": "map", + "map": [ + [ + "noun", + [ + "definite", + "nominative singular definite" + ] + ] + ] + } + ], + [ + "gjuhët", + { + "_type": "map", + "map": [ + [ + "noun", + [ + "accusative plural definite", + "nominative plural definite" + ] + ] + ] + } + ], + [ + "gjuhën", + { + "_type": "map", + "map": [ + [ + "noun", + [ + "accusative singular definite" + ] + ] + ] + } + ], + [ + "gjuhe", + { + "_type": "map", + "map": [ + [ + "noun", + [ + "singular indefinite ablative", + "dative singular indefinite" + ] + ] + ] + } + ], + [ + "gjuhës", + { + "_type": "map", + "map": [ + [ + "noun", + [ + "singular definite ablative", + "dative singular definite" + ] + ] + ] + } + ], + [ + "gjuhëve", + { + "_type": "map", + "map": [ + [ + "noun", + [ + "plural definite ablative", + "dative plural definite", + "dative plural indefinite" + ] + ] + ] + } + ], + [ + "gjuhësh", + { + "_type": "map", + "map": [ + [ + "noun", + [ + "plural indefinite ablative" + ] + ] + ] + } + ] + ] + } ] ] } \ No newline at end of file diff --git a/data/test/tidy/sq-en-lemmas.json b/data/test/tidy/sq-en-lemmas.json index 1956d77..b8482da 100644 --- a/data/test/tidy/sq-en-lemmas.json +++ b/data/test/tidy/sq-en-lemmas.json @@ -20,5 +20,224 @@ ] } } + }, + "gjuhë": { + "gjuhë": { + "noun": { + "ipa": [ + { + "ipa": "/ˈɟuhə/", + "tags": [] + }, + { + "ipa": "[ˈɡjuː(h)]", + "tags": [ + "Gheg", + "Northern" + ] + }, + { + "ipa": "[ˈɡuː(h)]", + "tags": [ + "Gheg", + "Northern" + ] + }, + { + "ipa": "[ˈɡũː]", + "tags": [ + "Kosovo" + ] + }, + { + "ipa": "[ˈɡʎuhə]", + "tags": [ + "Arbëresh", + "Arvanitika" + ] + }, + { + "ipa": "[ˈɡʎuɣə]", + "tags": [ + "Calabria" + ] + } + ], + "senses": [ + { + "glosses": [ + { + "type": "structured-content", + "content": [ + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": "tongue (organ)" + }, + { + "tag": "div", + "data": { + "listType": "ol" + }, + "style": { + "marginLeft": 2 + }, + "content": [ + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": [ + { + "tag": "span", + "data": { + "listType": "number" + }, + "content": "1. " + }, + "(figurative) speech, talking" + ] + }, + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": [ + { + "tag": "span", + "data": { + "listType": "number" + }, + "content": "2. " + }, + "strip of land" + ] + }, + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": [ + { + "tag": "span", + "data": { + "listType": "number" + }, + "content": "3. " + }, + "bell clapper, clanger, tongue" + ] + } + ] + } + ] + } + ], + "tags": [ + "feminine" + ] + }, + { + "glosses": [ + { + "type": "structured-content", + "content": [ + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": "language, tongue" + }, + { + "tag": "div", + "data": { + "listType": "ol" + }, + "style": { + "marginLeft": 2 + }, + "content": [ + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": [ + { + "tag": "span", + "data": { + "listType": "number" + }, + "content": "1. " + }, + "register, speech, style" + ] + }, + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": [ + { + "tag": "span", + "data": { + "listType": "number" + }, + "content": "2. " + }, + "language (generally, any form of communication)" + ] + }, + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": [ + { + "tag": "span", + "data": { + "listType": "number" + }, + "content": "3. " + }, + "(colloquial) local dialect" + ] + }, + { + "tag": "div", + "data": { + "listType": "li" + }, + "content": [ + { + "tag": "span", + "data": { + "listType": "number" + }, + "content": "4. " + }, + "(colloquial) Albanian, as a subject in school" + ] + } + ] + } + ] + } + ], + "tags": [ + "feminine" + ] + } + ] + } + } } } \ No newline at end of file diff --git a/jsconfig.json b/jsconfig.json new file mode 100644 index 0000000..609e7cc --- /dev/null +++ b/jsconfig.json @@ -0,0 +1,21 @@ +{ + "compilerOptions": { + "module": "ES2022", + "target": "ES2022", + "checkJs": true, + "strict": true, + "strictNullChecks": true, + "noImplicitAny": true, + "strictPropertyInitialization": true, + "suppressImplicitAnyIndexErrors": false + }, + "paths": { + "*": ["./types/*"], + "ext/json-schema": ["./types/ext/json-schema"] + }, + "exclude": [ + "node_modules", + "**/node_modules/*" + ] +} + \ No newline at end of file diff --git a/types/types.ts b/types/types.ts new file mode 100644 index 0000000..ab1e7ab --- /dev/null +++ b/types/types.ts @@ -0,0 +1,106 @@ +declare global { + type TidyEnv = { + source_iso: string, + target_iso: string, + kaikki_file: string, + tidy_folder: string, + } + + type KaikkiLine = { + head_templates?: HeadTemplate[]; + word?: string; + pos?: string; + sounds?: Sound[]; + forms?: FormInfo[]; + senses?: KaikkiSense[]; + } + + type HeadTemplate = { + name?: string; + args?: string[]; + } + + type Sound = { + ipa?: string|string[]; + tags?: string[]; + note?: string; + } + + type FormInfo = { + form?: string; + tags?: string[]; + } + + type KaikkiSense = { + glosses?: Glosses; + raw_glosses?: Glosses; + raw_gloss?: Glosses; + tags?: string[]; + raw_tags?: string[]; + form_of?: FormOf[]; + } + + type Glosses = string | string[]; + + type FormOf = { + word?: string; + } + + type GlossTree = Map & { + get(key: '_tags'): string[] | undefined; + set(key: '_tags', value: string[]): GlossTree; + }; + + type TidySense = Omit & { + tags: string[]; + glossesArray: string[]; + } + + type LemmaDict = { + [word: string]: { + [reading: string]: { + [pos: string]: LemmaInfo + } + } + } + + type LemmaInfo = { + ipa: IpaInfo[], + senses: SenseInfo[], + } + + type IpaInfo = { + ipa: string, + tags: string[], + } + + type SenseInfo = { + glosses: YomitanGloss[], + tags: string[], + } + + type YomitanGloss = string | StructuredGloss + + type StructuredGloss = { + type: "structured-content", + content: string | StructuredContent[], + } + + type StructuredContent = { + tag: string, + data: string, + content: StructuredContent, + } + + type Lemma = string; + type Form = string; + type PoS = string; + type FormsMap = Map>>; + type AutomatedForms = Map|string[]>>>; + + type NestedObject = { + [key: string]: NestedObject | any; + } +} + +export {} // This is needed to make this file a module \ No newline at end of file diff --git a/util/util.js b/util/util.js index 1f3aede..09f3499 100644 --- a/util/util.js +++ b/util/util.js @@ -1,3 +1,4 @@ +//@ts-nocheck const path = require('path'); const { readFileSync, writeFileSync, existsSync } = require('fs'); const date = require('date-and-time'); @@ -35,7 +36,10 @@ function sortTags(targetIso, tags) { } // sorts inflection entries to be nearby similar inflections - +/** + * @param {string[]} tags + * @returns {string[]} + */ function similarSort(tags) { return tags.sort((a, b) => { const aWords = a.split(' '); @@ -62,6 +66,11 @@ function similarSort(tags) { // input: ['first-person singular present', 'third-person singular present'] // output: ['first/third-person singular present'] +/** + * @param {string} targetIso + * @param {string[]} tags + * @returns {string[]} + */ function mergePersonTags(targetIso, tags) { const persons = ["first-person", "second-person", "third-person"]; @@ -70,7 +79,9 @@ function mergePersonTags(targetIso, tags) { return items.sort((a, b) => persons.indexOf(a) - persons.indexOf(b)); } + /** @type {string[]} */ const result = []; + /** @type {Object} */ const mergeObj = {}; for (const item of tags) {