yomidevs · ThatsItForTheOtherOne · Sep 22, 2024 · Sep 24, 2024 · Nov 5, 2024 · Nov 30, 2024
@@ -40,6 +40,9 @@ import {albanianTransforms} from './sq/albanian-transforms.js';
 import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
 import {tagalogTransforms} from './tl/tagalog-transforms.js';
 import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
+import {convertFinalLetters, convertYiddishLigatures} from './yi/yiddish-text-postprocessors.js';
+import {combineYiddishLigatures, removeYiddishDiacritics} from './yi/yiddish-text-preprocessors.js';
+import {yiddishTransforms} from './yi/yiddish-transforms.js';
 import {isStringPartiallyChinese, normalizePinyin} from './zh/chinese.js';
 
 const capitalizationPreprocessors = {
@@ -369,6 +372,21 @@ const languageDescriptors = [
             normalizeDiacritics,
         },
     },
+    {
+        iso: 'yi',
+        iso639_3: 'yid',
+        name: 'Yiddish',
+        exampleText: 'באַשאַפֿן',
+        textPreprocessors: {
+            removeYiddishDiacritics,
+            combineYiddishLigatures,
+        },
+        textPostprocessors: {
+            convertFinalLetters,
+            convertYiddishLigatures,
+        },
+        languageTransforms: yiddishTransforms,
+    },
     {
         iso: 'yue',
         iso639_3: 'yue',

@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2024  Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+
+// Maps each letter that has a distinct word-final form (key) to that final form (value).
+const final_letter_map = new Map([
+    ['\u05de', '\u05dd'], // מ to ם
+    ['\u05e0', '\u05df'], // נ to ן
+    ['\u05e6', '\u05e5'], // צ to ץ
+    ['\u05e4', '\u05e3'], // פ to ף
+    ['\u05dB', '\u05da'], // כ to ך
+]);
+
+// Pairs each precomposed code point (`lig`) with the equivalent sequence of separate
+// code points (`split`). The processors in this file iterate the entries in this order.
+const ligatures = [
+    {lig: '\u05f0', split: '\u05d5' + '\u05d5'}, // װ -> וו
+    {lig: '\u05f1', split: '\u05d5' + '\u05d9'}, // ױ -> וי
+    {lig: '\u05f2', split: '\u05d9' + '\u05d9'}, // ײ -> יי
+    {lig: '\ufb1d', split: '\u05d9' + '\u05b4'}, // יִ -> יִ
+    {lig: '\ufb1f', split: '\u05d9' + '\u05d9' + '\u05b7'}, // ײַ -> ייַ
+    {lig: '\ufb2e', split: '\u05d0' + '\u05b7'}, // Pasekh alef
+    {lig: '\ufb2f', split: '\u05d0' + '\u05b8'}, // Komets alef
+];
+
+/**
+ * Text postprocessor that replaces the last character of a string with its word-final
+ * form when `final_letter_map` has an entry for it. Any other string, including the
+ * empty string, is returned unchanged.
+ * @type {import('language').TextProcessor<boolean>}
+ */
+export const convertFinalLetters = {
+    name: 'Convert to Final Letters',
+    description: 'קויף → קויפֿ',
+    options: [true],
+    process: (str) => {
+        const lastIndex = str.length - 1;
+        // A single Map lookup replaces copying every key into an array on each call.
+        // For an empty string charAt(-1) yields '', which has no mapping.
+        const finalForm = final_letter_map.get(str.charAt(lastIndex));
+        return (typeof finalForm === 'string') ? str.substring(0, lastIndex) + finalForm : str;
+    },
+};
+
+// Entries of `ligatures` ordered so that longer `split` sequences are tried first. Without
+// this, the two-character pair יי is combined before the three-character sequence
+// יי + pasekh can ever match. The sort is stable, so equal-length entries keep table order.
+const ligaturesLongestSplitFirst = [...ligatures].sort((a, b) => b.split.length - a.split.length);
+
+/**
+ * Text postprocessor that converts between precomposed ligature code points and their
+ * split sequences, as listed in `ligatures`.
+ *  - 'off': returns the text unchanged.
+ *  - 'direct': replaces every ligature with its split sequence.
+ *  - 'inverse': replaces every split sequence with its ligature.
+ * Any other setting returns the text unchanged.
+ * @type {import('language').BidirectionalConversionPreprocessor}
+ */
+export const convertYiddishLigatures = {
+    name: 'Split Ligatures',
+    description: 'וו → װ',
+    options: ['off', 'direct', 'inverse'],
+    process: (str, setting) => {
+        switch (setting) {
+            case 'direct':
+                for (const {lig, split} of ligatures) {
+                    // replaceAll: a string pattern passed to replace() only changes the first occurrence.
+                    str = str.replaceAll(lig, split);
+                }
+                return str;
+            case 'inverse':
+                for (const {lig, split} of ligaturesLongestSplitFirst) {
+                    str = str.replaceAll(split, lig);
+                }
+                return str;
+            case 'off':
+            default:
+                // Always return a string, even for an unexpected setting.
+                return str;
+        }
+    },
+};
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2024  Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+// Pairs each precomposed code point (`lig`) with the equivalent sequence of separate
+// code points (`split`).
+// NOTE(review): the Yiddish text postprocessors appear to declare an identical table;
+// consider sharing a single definition — confirm before deduplicating.
+const ligatures = [
+    {lig: '\u05f0', split: '\u05d5' + '\u05d5'}, // װ -> וו
+    {lig: '\u05f1', split: '\u05d5' + '\u05d9'}, // ױ -> וי
+    {lig: '\u05f2', split: '\u05d9' + '\u05d9'}, // ײ -> יי
+    {lig: '\ufb1d', split: '\u05d9' + '\u05b4'}, // יִ -> יִ
+    {lig: '\ufb1f', split: '\u05d9' + '\u05d9' + '\u05b7'}, // ײַ -> ייַ
+    {lig: '\ufb2e', split: '\u05d0' + '\u05b7'}, // Pasekh alef
+    {lig: '\ufb2f', split: '\u05d0' + '\u05b8'}, // Komets alef
+];
+
+// Entries of `ligatures` ordered so that longer `split` sequences are tried first. Without
+// this, the two-character pair יי is combined before the three-character sequence
+// יי + pasekh can ever match. The sort is stable, so equal-length entries keep table order.
+const ligaturesLongestSplitFirst = [...ligatures].sort((a, b) => b.split.length - a.split.length);
+
+/**
+ * Text preprocessor that replaces every split character sequence listed in `ligatures`
+ * with its precomposed ligature code point.
+ * @type {import('language').TextProcessor<boolean>}
+ */
+export const combineYiddishLigatures = {
+    name: 'Combine Ligatures',
+    description: 'וו → װ',
+    options: [true],
+    process: (str) => {
+        for (const {lig, split} of ligaturesLongestSplitFirst) {
+            // replaceAll: a string pattern passed to replace() only changes the first occurrence.
+            str = str.replaceAll(split, lig);
+        }
+        return str;
+    },
+};
+
+/**
+ * Text preprocessor that deletes every code point in the range U+05B0–U+05C7 from the text.
+ * @type {import('language').TextProcessor<boolean>}
+ */
+export const removeYiddishDiacritics = {
+    name: 'Remove Diacritics',
+    description: 'פאת → פֿאָתּ',
+    options: [true],
+    process: (str) => str.replace(/[\u05B0-\u05C7]/g, ''),
+};
@@ -0,0 +1,158 @@
+/*
+ * Copyright (C) 2024  Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {suffixInflection} from '../language-transforms.js';
+
+/** @typedef {keyof typeof conditions} Condition */
+
+// Stem character changes undone during deinflection: `new` is the character expected in
+// the inflected stem and `orig` is the character written in its place in the deinflected
+// stem. One suffix rule is generated per entry by umlautMutationSuffixInflection.
+const mutations = [
+    {new: '\u05e2', orig: '\ufb2e'}, // Ayin to pasekh alef
+    {new: '\u05e2', orig: '\ufb2f'}, // Ayin to komets alef
+    {new: '\u05e2', orig: '\u05D0'}, // Ayin to shumter alef
+    {new: '\u05f1', orig: '\u05e2'}, // Vov yud to ayin
+    {new: '\u05f2', orig: '\u05f1'}, // Tsvey yudn to Vov yud
+    {new: '\u05d9', orig: '\u05d5'}, // Yud to Vov
+];
+
+// Matches the last character of the listed class in a string, i.e. one that is not followed
+// by another character of that class. Hoisted so it is compiled once rather than on every
+// deinflect call; it has no `g`/`y` flag, so sharing it carries no lastIndex state.
+const lastMutableCharacterRegExp = /[\u05E2\u05F0\u05D0\uFB2E\u05F1\u05D5\u05F2\uFB1D\uFB1F\u05D9\uFB2F](?!.*[\u05E2\u05F0\u05D0\uFB2E\u05F1\u05D5\u05F2\uFB1D\uFB1F\u05D9\uFB2F])/;
+
+/**
+ * Builds one suffix rule per entry of `mutations`. Each rule strips `inflectedSuffix`,
+ * finds the last character of the mutable class in the remaining stem, and, if that
+ * character equals the entry's `new` value, replaces it with the entry's `orig` value
+ * before appending `deinflectedSuffix`. If the stem has no such character, or it differs
+ * from `new`, the rule's `deinflect` returns an empty string.
+ * @param {string} inflectedSuffix Suffix of the inflected form; inserted into a RegExp as-is.
+ * @param {string} deinflectedSuffix Suffix appended to the deinflected stem.
+ * @param {Condition[]} conditionsIn
+ * @param {Condition[]} conditionsOut
+ * @returns {import('language-transformer').SuffixRule<Condition>[]}
+ */
+function umlautMutationSuffixInflection(inflectedSuffix, deinflectedSuffix, conditionsIn, conditionsOut) {
+    const suffixRegExp = new RegExp(inflectedSuffix + '$');
+    return mutations.map((mutation) => (
+        {
+            type: 'suffix',
+            isInflected: suffixRegExp,
+            deinflected: deinflectedSuffix,
+            deinflect: (/** @type {string} */ text) => {
+                const stem = text.slice(0, -inflectedSuffix.length);
+                const match = lastMutableCharacterRegExp.exec(stem);
+                if (match === null || match[0] !== mutation.new) { return ''; }
+                return stem.slice(0, match.index) + mutation.orig + stem.slice(match.index + 1) + deinflectedSuffix;
+            },
+            conditionsIn,
+            conditionsOut,
+        }
+    ));
+}
+
+// Grammatical conditions referenced by the transform rules. `v` and `n` group their
+// tense/number variants through `subConditions`; `isDictionaryForm` is false only for
+// past-tense verbs and plural nouns.
+const conditions = {
+    v: {
+        name: 'Verb',
+        isDictionaryForm: true,
+        subConditions: ['vpast', 'vpresent'],
+    },
+    vpast: {
+        name: 'Verb, past tense',
+        isDictionaryForm: false,
+    },
+    vpresent: {
+        name: 'Verb, present tense',
+        isDictionaryForm: true,
+    },
+    n: {
+        name: 'Noun',
+        isDictionaryForm: true,
+        subConditions: ['np', 'ns'],
+    },
+    np: {
+        name: 'Noun, plural',
+        isDictionaryForm: false,
+    },
+    ns: {
+        name: 'Noun, singular',
+        isDictionaryForm: true,
+    },
+    adj: {
+        name: 'Adjective',
+        isDictionaryForm: true,
+    },
+    adv: {
+        name: 'Adverb',
+        isDictionaryForm: true,
+    },
+};
+
+/**
+ * Deinflection rules for Yiddish ('yi'). Plain transforms strip or swap an ending via
+ * `suffixInflection`; the umlaut variants use `umlautMutationSuffixInflection`, which
+ * also reverts a stem character. Suffixes are written as Unicode escapes, with the
+ * romanized ending in the trailing comment.
+ * @type {import('language-transformer').LanguageTransformDescriptor<Condition>}
+ */
+export const yiddishTransforms = {
+    language: 'yi',
+    conditions,
+    transforms: {
+        // Plural noun -> singular noun by removing the plural ending.
+        plural: {
+            name: 'plural',
+            description: 'plural form of a noun',
+            rules: [
+                suffixInflection('\u05E1', '', ['np'], ['ns']), // -s
+                suffixInflection('\u05DF', '', ['np'], ['ns']), // -n
+                suffixInflection('\u05D9\u05DD', '', ['np'], ['ns']), // -im, hebrew
+                suffixInflection('\u05E2\u05E8', '', ['np'], ['ns']), // -er
+                suffixInflection('\u05E2\u05DA', '', ['np'], ['ns']), // -ekh
+                suffixInflection('\u05E2\u05DF', '', ['np'], ['ns']), // -en
+                suffixInflection('\u05E2\u05E1', '', ['np'], ['ns']), // -es
+                suffixInflection('\u05D5\u05EA', '', ['np'], ['ns']), // -ot, hebrew
+            ],
+        },
+        // Plural noun -> singular noun where the stem character changed as well.
+        umlaut_plural: {
+            name: 'umlaut_plural',
+            description: 'plural form of a umlaut noun',
+            rules: [
+                ...umlautMutationSuffixInflection('\u05E2\u05E8', '', ['np'], ['ns']), // -er
+                ...umlautMutationSuffixInflection('\u05E2\u05E1', '', ['np'], ['ns']), // -es
+                ...umlautMutationSuffixInflection('\u05D9\u05DD', '', ['np'], ['ns']), // -im
+                ...umlautMutationSuffixInflection('\u05E2\u05DF', '', ['np'], ['ns']), // -en
+                ...umlautMutationSuffixInflection('\u05DF', '', ['np'], ['ns']), // -n
+            ],
+        },
+        // Diminutive noun -> base noun by removing the diminutive ending.
+        diminutive: {
+            name: 'diminutive',
+            description: 'diminutive form of a noun',
+            rules: [
+                suffixInflection('\u05D8\u05E9\u05D9\u05E7', '', ['n'], ['n']), // -tshik
+                suffixInflection('\u05E7\u05E2', '', ['n'], ['n']), // -ke
+                suffixInflection('\u05DC', '', ['n'], ['n']), // -l
+                suffixInflection('\u05E2\u05DC\u05E2', '', ['n'], ['n']), // -ele
+            ],
+        },
+        // Diminutive noun -> base noun where the stem character changed as well.
+        diminutive_and_umlaut: {
+            name: 'diminutive_and_umlaut',
+            description: 'diminutive form of a noun with stem umlaut',
+            rules: [
+                ...umlautMutationSuffixInflection('\u05DC', '', ['n'], ['n']), // -l
+                ...umlautMutationSuffixInflection('\u05E2\u05DC\u05E2', '', ['n'], ['n']), // -ele
+            ],
+        },
+        // Removes the listed verb endings, leaving the bare stem.
+        verb_present_singular_to_first_person: {
+            name: 'verb_present_singular_to_first_person',
+            description: 'Turn the second and third person singular form to first person',
+            rules: [
+                suffixInflection('\u05E1\u05D8', '', ['v'], ['vpresent']), // -st
+                suffixInflection('\u05D8', '', ['v'], ['vpresent']), // -t
+                suffixInflection('\u05E0\u05D3\u05D9\u05E7', '', ['v'], ['vpresent']), // -ndik
+            ],
+        },
+        // Replaces the listed verb endings with non-final nun (U+05E0).
+        verb_present_plural_to_first_person: {
+            name: 'verb_present_plural_to_first_person',
+            description: 'Turn the second plural form to first person plural form',
+            rules: [
+                suffixInflection('\u05D8\u05E1', '\u05E0', ['v'], ['vpresent']), // -ts
+                suffixInflection('\u05D8', '\u05E0', ['v'], ['vpresent']), // -t
+            ],
+        },
+    },
+};
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2023-2024  Yomitan Authors
+ * Copyright (C) 2020-2022  Yomichan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {LanguageTransformer} from '../../ext/js/language/language-transformer.js';
+import {yiddishTransforms} from '../../ext/js/language/yi/yiddish-transforms.js';
+import {testLanguageTransformer} from '../fixtures/language-transformer-test.js';
+
+/*
+ * Yiddish final letters are handled in a text postprocessor that runs after all the
+ * transformations, so the `term` of a test case must never use the final form of a letter;
+ * otherwise the test fails even if the rule is correct. The `source` of a plural test case,
+ * however, must use final letters, because plural deinflection requires them.
+ */
+const tests = [
+    {
+        category: 'nouns',
+        valid: true,
+        tests: [
+            {term: 'גרופּע', source: 'גרופּעס', rule: 'ns', reasons: ['plural']}, // grupes -> grupe
+            {term: 'טיש', source: 'טישן', rule: 'ns', reasons: ['plural']}, // tishn -> tish
+            {term: 'פּויער', source: 'פּויערים', rule: 'ns', reasons: ['plural']}, // poyerim -> poyer
+            {term: 'קינד', source: 'קינדער', rule: 'ns', reasons: ['plural']}, // kinder -> kind
+            {term: 'בענקל', source: 'בענקלעך', rule: 'ns', reasons: ['plural']}, // benklekh -> benkl
+            {term: 'באַנ', source: 'באַנען', rule: 'ns', reasons: ['plural']}, // banen -> ban
+            {term: 'נודניק', source: 'נודניקעס', rule: 'ns', reasons: ['plural']}, // nudnikes -> nudnik
+            {term: 'חלומ', source: 'חלומות', rule: 'ns', reasons: ['plural']}, // khlomos -> khlom
+            {term: 'עטיקעט', source: 'עטיקעטקע', rule: 'n', reasons: ['diminutive']}, // etiketke -> etiket
+            {term: 'קליענטעל', source: 'קליענטעלטשיק', rule: 'n', reasons: ['diminutive']}, // klienteltshik -> klientel
+            {term: 'קינדער', source: 'קינדערלעך', rule: 'ns', reasons: ['diminutive', 'plural']}, // kinderlekh -> kinder
+        ],
+    },
+    {
+        category: 'umlaut_nouns',
+        valid: true,
+        tests: [
+            {term: 'מאנ', source: 'מענער', rule: 'ns', reasons: ['umlaut_plural']}, // mener -> man
+            {term: 'טשוואק', source: 'טשוועקעס', rule: 'ns', reasons: ['umlaut_plural']}, // tshvekes -> tshvak
+            {term: 'מױד', source: 'מײדלעך', rule: 'ns', reasons: ['diminutive_and_umlaut', 'plural']}, // meydlekh -> moyd
+            {term: 'דאָקטער', source: 'דאָקטױרים', rule: 'ns', reasons: ['umlaut_plural']}, // doktoyrim -> dokter
+            {term: 'בלומ', source: 'בלימען', rule: 'ns', reasons: ['umlaut_plural']}, // blimen -> blum
+            {term: 'אומשטאנד', source: 'אומשטענדן', rule: 'ns', reasons: ['umlaut_plural']}, // umshtendn -> umshtand
+            {term: 'קאצ', source: 'קעצעלע', rule: 'n', reasons: ['diminutive_and_umlaut']}, // ketzele -> katz
+            {term: 'קאצ', source: 'קעצל', rule: 'n', reasons: ['diminutive_and_umlaut']}, // ketzl -> katz
+        ],
+    },
+    {
+        category: 'verbs',
+        valid: true,
+        tests: [
+            {term: 'קויפֿ', source: 'קויפֿסט', rule: 'v', reasons: ['verb_present_singular_to_first_person']},
+            {term: 'קויפֿ', source: 'קויפֿט', rule: 'vpresent', reasons: ['verb_present_singular_to_first_person']},
+            {term: 'קויפֿנ', source: 'קויפֿט', rule: 'vpresent', reasons: ['verb_present_plural_to_first_person']},
+            {term: 'קויפֿנ', source: 'קויפֿטס', rule: 'vpresent', reasons: ['verb_present_plural_to_first_person']},
+        ],
+    },
+];
+
+
+// Register the Yiddish descriptor on a fresh transformer, then run the table of cases
+// above through the shared language-transformer test fixture.
+const languageTransformer = new LanguageTransformer();
+languageTransformer.addDescriptor(yiddishTransforms);
+testLanguageTransformer(languageTransformer, tests);