-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add script to merge ipa from all wiktionaries for a language (#6)
* wip * rename * finish? * delete empty file
- Loading branch information
1 parent
9c7ad11
commit 02c810d
Showing
8 changed files
with
148 additions
and
54 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,11 @@ | ||
[ | ||
["🏴", "dialect", 0, "Received-Pronunciation", 0], | ||
["🇬🇧", "dialect", 0, "UK", 0], | ||
["🇺🇸", "dialect", 0, "US", 0], | ||
["🇺🇸", "dialect", 0, ["US", "General-American"], 0], | ||
["🇦🇺", "dialect", 0, "General-Australian", 0], | ||
["🇺🇸", "dialect", 0, "General-American", 0], | ||
["🇳🇿", "dialect", 0, "New-Zealand", 0], | ||
["🇨🇦", "dialect", 0, "Canada", 0], | ||
["🏴", "dialect", 0, "Scotland", 0], | ||
["🏴", "dialect", 0, "Scottish", 0], | ||
["🏴", "dialect", 0, ["Scotland", "Scottish"], 0], | ||
["🇮🇪", "dialect", 0, "Ireland", 0], | ||
["ccm", "dialect", 0, "cot-caught-merger", 0] | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
const StreamZip = require('node-stream-zip'); | ||
const { execSync } = require('child_process'); | ||
const { readdirSync, existsSync, readFileSync, writeFileSync, unlinkSync } = require('fs'); | ||
const { writeInBatches } = require('./util/util'); | ||
const date = require('date-and-time'); | ||
const now = new Date(); | ||
|
||
/**
 * Reads every term-meta bank and tag bank out of one per-wiktionary IPA archive.
 *
 * @param {string} archivePath - Path of an existing `kty-<source>-<target>-ipa.zip` file.
 * @returns {Promise<{entries: Array, tags: Array}>} The concatenated contents of all
 *     `term_meta_bank_*` entries (`entries`) and all `tag_bank_*` entries (`tags`).
 */
async function readIpaArchive(archivePath) {
    const zip = new StreamZip.async({ file: archivePath });
    let entries = [];
    let tags = [];

    try {
        for (const entryName of Object.keys(await zip.entries())) {
            if (entryName.startsWith("term_meta_bank_")) {
                entries = entries.concat(JSON.parse(await zip.entryData(entryName)));
            } else if (entryName.startsWith("tag_bank_")) {
                tags = tags.concat(JSON.parse(await zip.entryData(entryName)));
            }
        }
    } finally {
        // Release the archive even when an entry cannot be read or is not valid JSON.
        await zip.close();
    }

    return { entries, tags };
}

/**
 * Copies the full definitions of the given tag ids into the merged tag table.
 * Ids already registered keep their first definition; ids that the local tag
 * bank does not define are skipped instead of crashing the merge.
 *
 * @param {string[]} tagIds - Tag ids referenced by a transcription.
 * @param {Map<string, Array>} localTagsById - Tag rows of the archive being merged, keyed by row[0].
 * @param {Map<string, Array>} globalTags - Merged tag rows, keyed by row[0]; mutated in place.
 */
function registerTags(tagIds, localTagsById, globalTags) {
    for (const tagId of tagIds) {
        if (globalTags.has(tagId)) {
            continue;
        }
        const definition = localTagsById.get(tagId);
        if (definition) {
            globalTags.set(tagId, definition);
        }
    }
}

/**
 * Folds the IPA entries of one archive into the per-source-language accumulators.
 *
 * Entries are rows of the form `[term, mode, { transcriptions: [{ ipa, tags }] }]`.
 * The first row seen for a term is adopted as-is; later rows only contribute
 * transcriptions whose `ipa` is new, or tags that an already-known `ipa` lacks.
 *
 * @param {Array} entries - Term-meta rows read from one archive.
 * @param {Array} localTags - Tag rows read from the same archive (row[0] is the tag id).
 * @param {Map<string, Array>} globalIpa - Merged rows keyed by term; mutated in place.
 * @param {Map<string, Array>} globalTags - Merged tag rows keyed by tag id; mutated in place.
 */
function mergeIpaEntries(entries, localTags, globalIpa, globalTags) {
    // Index once so tag lookups are not a linear scan per transcription.
    // Only the first row per id is kept, matching what Array#find used to return.
    const localTagsById = new Map();
    for (const tagRow of localTags) {
        if (!localTagsById.has(tagRow[0])) {
            localTagsById.set(tagRow[0], tagRow);
        }
    }

    for (const entry of entries) {
        const [term, , data] = entry;
        const incomingTranscriptions = data && Array.isArray(data.transcriptions) ? data.transcriptions : [];
        const known = globalIpa.get(term);

        if (!known) {
            globalIpa.set(term, entry);
            // The adopted row keeps its tags, so their definitions must be exported too.
            for (const transcription of incomingTranscriptions) {
                if (Array.isArray(transcription.tags)) {
                    registerTags(transcription.tags, localTagsById, globalTags);
                }
            }
            continue;
        }

        const knownTranscriptions = known[2]['transcriptions'];
        for (const incoming of incomingTranscriptions) {
            const incomingTags = Array.isArray(incoming.tags) ? incoming.tags : [];
            const match = knownTranscriptions.find(({ ipa }) => ipa === incoming.ipa);

            if (!match) {
                knownTranscriptions.push(incoming);
            } else if (incomingTags.length > 0) {
                if (!Array.isArray(match.tags)) {
                    match.tags = [];
                }
                for (const tagId of incomingTags) {
                    if (!match.tags.includes(tagId)) {
                        match.tags.push(tagId);
                    }
                }
            }

            registerTags(incomingTags, localTagsById, globalTags);
        }
    }
}

/**
 * For every source language in `languages.json`, merges the IPA archives of all
 * target languages (`data/language/<source>/<target>/kty-<source>-<target>-ipa.zip`)
 * into a single `data/language/<source>/kty-<source>-ipa.zip`.
 *
 * The merged banks are staged in `data/temp/ipa` (which must already exist) and
 * packed with the external `zip` command. Source languages for which no archive
 * yields any entry produce no output.
 *
 * @returns {Promise<void>}
 */
async function main() {
    const languages = JSON.parse(readFileSync('languages.json', 'utf8'));

    for (const { iso: sourceIso } of languages) {
        // Maps instead of plain objects/arrays: terms such as "constructor" must not
        // collide with Object.prototype members, and tag lookups become O(1).
        const globalIpa = new Map();
        const globalTags = new Map();

        for (const { iso: targetIso } of languages) {
            const archivePath = `data/language/${sourceIso}/${targetIso}/kty-${sourceIso}-${targetIso}-ipa.zip`;
            if (!existsSync(archivePath)) {
                continue;
            }
            console.log("found", archivePath);

            const { entries, tags } = await readIpaArchive(archivePath);
            console.log("localIpa", entries.length);
            console.log("localTags", tags.length);

            mergeIpaEntries(entries, tags, globalIpa, globalTags);
        }

        if (globalIpa.size === 0) {
            continue;
        }
        console.log("globalIpa", globalIpa.size);
        if (globalTags.size > 0) {
            console.log("globalTags", globalTags.size);
        }

        const globalIndex = {
            "format": 3,
            "revision": date.format(now, 'YYYY.MM.DD'),
            "sequenced": true,
            "title": `kty-${sourceIso}-ipa`
        };

        // Empty the staging directory so banks from the previous language are not re-packed.
        for (const staleFile of readdirSync('data/temp/ipa')) {
            unlinkSync(`data/temp/ipa/${staleFile}`);
        }

        writeFileSync(`data/temp/ipa/index.json`, JSON.stringify(globalIndex, null, 4));
        writeInBatches('data/temp/ipa', Array.from(globalIpa.values()), 'term_meta_bank_', 500000);
        writeInBatches('data/temp/ipa', Array.from(globalTags.values()), 'tag_bank_', 50000);

        const outputPath = `data/language/${sourceIso}/kty-${sourceIso}-ipa.zip`;
        // `zip` updates an existing archive in place, which would keep banks left
        // over from an earlier, larger run; start from a fresh archive instead.
        if (existsSync(outputPath)) {
            unlinkSync(outputPath);
        }
        execSync(`zip -j ${outputPath} data/temp/ipa/*`);
    }
}
|
||
// Run the merge. Report any failure explicitly and exit non-zero instead of
// leaving the returned promise unhandled.
main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters