
Commit

📚 improve french target language
StefanVukovic99 committed Jan 12, 2024
1 parent f353fb2 commit 60b6649
Showing 4 changed files with 180 additions and 114 deletions.
2 changes: 1 addition & 1 deletion 2-extract-language.py
@@ -4,7 +4,7 @@
source_iso = os.environ.get("source_iso")
target_iso = os.environ.get("target_iso")

input_file = f"data/kaikki/{source_iso}-extract.json"
input_file = f"data/kaikki/{target_iso}-extract.json"
output_file = f"data/kaikki/{source_iso}-{target_iso}-extract.json"

print(f"Reading {input_file} and writing {output_file}...")
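
For context, a minimal sketch of the paths this script derives after the fix, using hypothetical values source_iso=de and target_iso=fr (in the pipeline they come from the environment; auto.sh exports target_iso, for example): the input extract is now keyed by the target (gloss) language, while the output name keeps both ISO codes.

# Illustration only — not part of the commit. The values below are hypothetical.
import os

os.environ["source_iso"] = "de"
os.environ["target_iso"] = "fr"

source_iso = os.environ.get("source_iso")
target_iso = os.environ.get("target_iso")

# After this change the input is the extract for the target (gloss) language.
input_file = f"data/kaikki/{target_iso}-extract.json"                 # data/kaikki/fr-extract.json
output_file = f"data/kaikki/{source_iso}-{target_iso}-extract.json"   # data/kaikki/de-fr-extract.json
print(f"Reading {input_file} and writing {output_file}...")
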
91 changes: 58 additions & 33 deletions 2-tidy-up.py
@@ -1,10 +1,21 @@
import json
import os
import re

source_iso = os.environ.get("source_iso")
target_iso = os.environ.get("target_iso")
kaikki_file = os.environ.get("kaikki_file")

def isInflectionGloss(glosses):
if(target_iso == 'en'):
return re.match(r".*inflection of.*", json.dumps(glosses))
elif(target_iso == 'fr'):
if re.match(r"(.*)du verbe\s+((?:(?!\bdu\b).)*)$", json.dumps(glosses)):
return True
if re.search(r"((?:(?:Masculin|Féminin)\s)?(?:(?:p|P)luriel|(?:s|S)ingulier)) de ([^\s]+)", json.dumps(glosses)):
return True
return False

def handle_level(nest, level):
nest_defs = []
def_index = 0
@@ -57,6 +68,15 @@ def handle_nest(nested_gloss_obj, sense):
form_stuff = []
automated_forms = {}

def addDeinflections(form_dict, word, pos, lemma, inflections):
if(target_iso == 'fr'):
word = re.sub(r"(qu\')?(ils/elles|il/elle/on)\s*", '', word)
form_dict[word] = form_dict.get(word, {})
form_dict[word][lemma] = form_dict[word].get(lemma, {})
form_dict[word][lemma][pos] = form_dict[word][lemma].get(pos, [])

form_dict[word][lemma][pos].extend(inflections)

with open(f'data/kaikki/{kaikki_file}') as file:
for line in file:
line_count += 1
@@ -102,7 +122,7 @@ def handle_nest(nested_gloss_obj, sense):
if form_of:
form_stuff.append([word, sense, pos])
else:
if 'inflection of ' not in json.dumps(glosses):
if not isInflectionGloss(glosses):
lemma_dict[word] = lemma_dict.get(word, {})
lemma_dict[word][pos] = lemma_dict[word].get(pos, {})
lemma_dict[word][pos]['ipa'] = lemma_dict[word][pos].get('ipa', [])
@@ -139,26 +159,36 @@ def handle_nest(nested_gloss_obj, sense):

if curr_sense['glosses']:
lemma_dict[word][pos]['senses'].append(curr_sense)

if 'inflection of ' in json.dumps(glosses):
lemma = sense['glosses'][0]\
.replace('.+(?=inflection of)', '')\
.replace(' \\(.+?\\)', '')\
.replace(':$', '')\
.replace(':\\n.+', '')\
.replace('inflection of ', '')\
.replace(':.+', '')\
.strip()

inflection = sense['glosses'][1] if len(sense['glosses']) > 1 else ''

if inflection and 'inflection of ' not in inflection and word != lemma:
form_dict[word] = form_dict.get(word, {})
form_dict[word][lemma] = form_dict[word].get(lemma, {})
form_dict[word][lemma][pos] = form_dict[word][lemma].get(pos, [])

form_dict[word][lemma][pos].append(inflection)
sense_index += 1
else:
if(target_iso == 'en'):
lemma = re.sub(r'.+(?=inflection of)', '', sense['glosses'][0])
lemma = re.sub(r' \(.+?\)', '', lemma)
lemma = re.sub(r':$', '', lemma)
lemma = re.sub(r':\n.+', '', lemma)
lemma = re.sub(r'inflection of ', '', lemma)
lemma = re.sub(r':.+', '', lemma)
lemma = lemma.strip()

inflection = sense['glosses'][1] if len(sense['glosses']) > 1 else ''

if inflection and 'inflection of ' not in inflection and word != lemma:
addDeinflections(form_dict, word, pos, lemma, [inflection])

elif(target_iso == 'fr'):
inflection, lemma = None, None

if regexMatch := re.match(r"(.*)du verbe\s+((?:(?!\bdu\b).)*)$", sense['glosses'][0]):
inflection, lemma = regexMatch.group(1), regexMatch.group(2)

elif regexMatch := re.match(r"^((?:(?:Masculin|Féminin)\s)?(?:(?:p|P)luriel|(?:s|S)ingulier)) de ([^\s]*)$", sense['glosses'][0].strip()):
inflection, lemma = regexMatch.group(1), regexMatch.group(2)

if inflection and lemma:
inflection = inflection.strip()
lemma = re.sub(r'\.$', '', lemma).strip()

if inflection and word != lemma:
addDeinflections(form_dict, word, pos, lemma, [inflection])

print(f"Processed {line_count} lines...")

@@ -168,14 +198,10 @@ def handle_nest(nested_gloss_obj, sense):
lemma = form_of[0]['word']

if form != lemma and glosses:
form_dict[form] = form_dict.get(form, {})
form_dict[form][lemma] = form_dict[form].get(lemma, {})
form_dict[form][lemma][pos] = form_dict[form][lemma].get(pos, [])

if not "##" in glosses[0]:
form_dict[form][lemma][pos].append(glosses[0])
addDeinflections(form_dict, form, pos, lemma, [glosses[0]])
elif len(glosses) > 1:
form_dict[form][lemma][pos].append(glosses[1])
addDeinflections(form_dict, form, pos, lemma, [glosses[1]])

missing_forms = 0

@@ -187,18 +213,17 @@ def handle_nest(nested_gloss_obj, sense):
for lemma, parts in info.items():
for pos, glosses in parts.items():
if form != lemma:
form_dict[form] = form_dict.get(form, {})
form_dict[form][lemma] = form_dict[form].get(lemma, {})
form_dict[form][lemma][pos] = form_dict[form][lemma].get(pos, [])

form_dict[form][lemma][pos].extend([f"-automated- {gloss}" for gloss in glosses])
inflections = [f"-automated- {gloss}" for gloss in glosses]
addDeinflections(form_dict, form, pos, lemma, inflections)

print(f"There were {missing_forms} missing forms that have now been automatically populated.")

print(f"Writing lemma dict to data/tidy/{source_iso}-{target_iso}-lemmas.json...")
with open(f"data/tidy/{source_iso}-{target_iso}-lemmas.json", "w") as f:
json.dump(lemma_dict, f)

print(f"Writing form dict to data/tidy/{source_iso}-{target_iso}-forms.json...")
with open(f"data/tidy/{source_iso}-{target_iso}-forms.json", "w") as f:
json.dump(form_dict, f)

print('2-tidy-up.py finished.')
print('2-tidy-up.py finished.')
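
For illustration, a minimal, self-contained sketch of what the new French branch is intended to extract, run against two invented example glosses (the gloss strings are assumptions, not taken from the data; the regexes and the clean-up of the trailing period are copied from the diff above):

import re

# Invented example glosses, purely for illustration.
examples = [
    "Première personne du singulier de l'indicatif présent du verbe manger.",
    "Pluriel de cheval.",
]

for gloss in examples:
    inflection, lemma = None, None

    # "… du verbe <lemma>" pattern, as in the diff.
    if m := re.match(r"(.*)du verbe\s+((?:(?!\bdu\b).)*)$", gloss):
        inflection, lemma = m.group(1), m.group(2)
    # "(Masculin/Féminin) Pluriel/Singulier de <lemma>" pattern, as in the diff.
    elif m := re.match(r"^((?:(?:Masculin|Féminin)\s)?(?:(?:p|P)luriel|(?:s|S)ingulier)) de ([^\s]*)$", gloss.strip()):
        inflection, lemma = m.group(1), m.group(2)

    if inflection and lemma:
        inflection = inflection.strip()
        lemma = re.sub(r"\.$", "", lemma).strip()
        print(f"{gloss!r} -> inflection={inflection!r}, lemma={lemma!r}")

# Expected output (roughly):
#   first gloss  -> inflection "Première personne du singulier de l'indicatif présent", lemma "manger"
#   second gloss -> inflection "Pluriel", lemma "cheval"
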
6 changes: 6 additions & 0 deletions auto.sh
@@ -80,6 +80,12 @@ for entry in "${entries[@]}"; do
continue
fi

target_languages="es de en fr ru zh"
if [[ ! "$target_languages" == *"$target_iso"* ]]; then
echo "Unsupported target language: $target_iso"
continue
fi

export target_iso="$target_iso"
export target_language="$target_language_name"

195 changes: 115 additions & 80 deletions util/kaikki-breakdown.py
@@ -2,94 +2,129 @@
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os

counter = {}
for target_iso in ['de', 'es', 'ru', 'zh', 'fr']:
print(f"Processing {target_iso}...")
counter[target_iso] = {}
with open(f'../data/kaikki/{target_iso}-extract.json', "r", encoding="utf-8") as f:
line_count = 0
print_interval = 1000

for line in f:
line_count += 1
if line_count % print_interval == 0:
print(f"Processed {line_count} lines...", end='\r')
try:
obj = json.loads(line.strip())
except json.JSONDecodeError:
print(f"Error decoding JSON in line {line_count}. Skipping...")
continue

if "lang_code" in obj:
counter[target_iso][obj["lang_code"]] = counter[target_iso].get(obj["lang_code"], 0) + 1
else:
if "redirect" in obj:
counter[target_iso]["redirect"] = counter[target_iso].get("redirect", 0) + 1
else:
counter[target_iso]["error"] = counter[target_iso].get("error", 0) + 1
print(json.dumps(counter[target_iso], indent=4))

# open every file that starts with kaikki in the kaikki folder
print(f"Processing en...")
counter["en"] = {}
for file in os.listdir("../data/kaikki"):
if file.startswith("kaikki"):
print(f"Processing {file}...")
with open(f"../data/kaikki/{file}", "r", encoding="utf-8") as f:
line_count = 0
print_interval = 1000

for line in f:
line_count += 1
if line_count % print_interval == 0:
print(f"Processed {line_count} lines...", end='\r')
try:
obj = json.loads(line.strip())
except json.JSONDecodeError:
print(f"Error decoding JSON in line {line_count}. Skipping...")
continue

if "lang_code" in obj:
counter["en"][obj["lang_code"]] = counter["en"].get(obj["lang_code"], 0) + 1
else:
if "redirect" in obj:
counter["en"]["redirect"] = counter["en"].get("redirect", 0) + 1
else:
counter["en"]["error"] = counter["en"].get("error", 0) + 1

for target_iso in counter:
for target_iso2 in counter:
for source_iso in counter[target_iso]:
if source_iso not in counter[target_iso2]:
counter[target_iso2][source_iso] = 0

for target_iso in counter:
if "error" in counter[target_iso]:
del counter[target_iso]["error"]
if "redirect" in counter[target_iso]:
del counter[target_iso]["redirect"]
counter[target_iso] = {k: v for k, v in sorted(counter[target_iso].items(), key=lambda item: item[0])}

heatmap_data = [[counter[key1].get(key2, 0) for key2 in counter[key1]] for key1 in counter]

df = pd.DataFrame(heatmap_data, index=list(counter.keys()), columns=list(counter["de"].keys()))

# Sort rows and columns by their sum
size = 20

# import os
# counter = {}
# for target_iso in ['de', 'es', 'ru', 'zh', 'fr']:
# print(f"Processing {target_iso}...")
# counter[target_iso] = {}
# with open(f'../data/kaikki/{target_iso}-extract.json', "r", encoding="utf-8") as f:
# line_count = 0
# print_interval = 1000

# for line in f:
# line_count += 1
# if line_count % print_interval == 0:
# print(f"Processed {line_count} lines...", end='\r')
# try:
# obj = json.loads(line.strip())
# except json.JSONDecodeError:
# print(f"Error decoding JSON in line {line_count}. Skipping...")
# continue

# if "lang_code" in obj:
# counter[target_iso][obj["lang_code"]] = counter[target_iso].get(obj["lang_code"], 0) + 1
# else:
# if "redirect" in obj:
# counter[target_iso]["redirect"] = counter[target_iso].get("redirect", 0) + 1
# else:
# counter[target_iso]["error"] = counter[target_iso].get("error", 0) + 1
# print(json.dumps(counter[target_iso], indent=4))

# print(f"Processing en...")
# counter["en"] = {}
# for file in os.listdir("../data/kaikki"):
# if file.startswith("kaikki"):
# print(f"Processing {file}...")
# with open(f"../data/kaikki/{file}", "r", encoding="utf-8") as f:
# line_count = 0
# print_interval = 1000

# for line in f:
# line_count += 1
# if line_count % print_interval == 0:
# print(f"Processed {line_count} lines...", end='\r')
# try:
# obj = json.loads(line.strip())
# except json.JSONDecodeError:
# print(f"Error decoding JSON in line {line_count}. Skipping...")
# continue

# if "lang_code" in obj:
# counter["en"][obj["lang_code"]] = counter["en"].get(obj["lang_code"], 0) + 1
# else:
# if "redirect" in obj:
# counter["en"]["redirect"] = counter["en"].get("redirect", 0) + 1
# else:
# counter["en"]["error"] = counter["en"].get("error", 0) + 1

# for target_iso in counter:
# for target_iso2 in counter:
# for source_iso in counter[target_iso]:
# if source_iso not in counter[target_iso2]:
# counter[target_iso2][source_iso] = 0

# for target_iso in counter:
# if "error" in counter[target_iso]:
# del counter[target_iso]["error"]
# if "redirect" in counter[target_iso]:
# del counter[target_iso]["redirect"]
# counter[target_iso] = {k: v for k, v in sorted(counter[target_iso].items(), key=lambda item: item[0])}

# heatmap_data = [[counter[key1].get(key2, 0) for key2 in counter[key1]] for key1 in counter]

# source_languages = list(counter.keys())
# target_languages = list(counter["de"].keys())

# with open('heatmap_data.json', 'w') as f:
# json.dump(heatmap_data, f)

# with open('source_languages.json', 'w') as f:
# json.dump(source_languages, f)

# with open('target_languages.json', 'w') as f:
# json.dump(target_languages, f)

with open('heatmap_data.json', 'r') as f:
heatmap_data = json.load(f)

with open('source_languages.json', 'r') as f:
source_languages = json.load(f)

with open('target_languages.json', 'r') as f:
target_languages = json.load(f)

annotations = []
for row in heatmap_data:
new_row = []
for cell in row:
if cell < 1000:
new_row.append(str(cell))
else:
rounded_value = int(round(cell / 1000, 0))
new_row.append(f"{rounded_value}k")
annotations.append(new_row)

size = 25

df = pd.DataFrame(heatmap_data, index=source_languages, columns=target_languages)
df = df.loc[df.sum(axis=1).sort_values(ascending=False).head(size).index]
df = df[df.sum().sort_values(ascending=False).head(size).index]

annotations = pd.DataFrame(annotations, index=source_languages, columns=target_languages)
annotations = annotations.loc[df.sum(axis=1).sort_values(ascending=False).head(size).index]
annotations = annotations[df.sum().sort_values(ascending=False).head(size).index]

# Set a larger figure size
plt.figure(figsize=(16, 4))
plt.figure(figsize=(15, 4))

# Create a heatmap using seaborn
sns.heatmap(df, annot=True, fmt="d", cmap="YlGnBu", annot_kws={"size": 8}, vmin=10000, vmax=200000)
sns.heatmap(df, annot=annotations, cmap="YlGnBu", annot_kws={"size": 7}, fmt="s", vmax=150000, cbar_kws={'label': 'number of words'})

# Add labels and title
plt.xlabel("Source Language", fontsize=14)
plt.ylabel("Target Language", fontsize=14)
plt.xlabel("Source Language (headwords in this language)", fontsize=8)
plt.ylabel("Target Language (glosses in this language)", fontsize=8)
plt.title("yzkW", fontsize=12)

# Save the plot with a higher resolution
plt.savefig("heatmap.png", dpi=300)
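
A small, self-contained sketch of the new annotation scheme in util/kaikki-breakdown.py — counts under 1000 shown verbatim, larger counts rounded to the nearest thousand with a "k" suffix — using made-up counts and language codes; passing pre-formatted strings to seaborn is what annot=annotations together with fmt="s" does in the diff:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Made-up counts and language codes, purely for illustration.
df = pd.DataFrame(
    [[120000, 800], [25000, 3400]],
    index=["en", "fr"],    # source languages (headwords)
    columns=["de", "es"],  # target languages (glosses)
)

# Same formatting rule as the diff: small counts verbatim, large ones as "Nk".
annotations = df.applymap(lambda cell: str(cell) if cell < 1000 else f"{int(round(cell / 1000))}k")

plt.figure(figsize=(4, 3))
sns.heatmap(df, annot=annotations, fmt="s", cmap="YlGnBu",
            annot_kws={"size": 7}, cbar_kws={"label": "number of words"})
plt.savefig("heatmap-demo.png", dpi=300)
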

