
Commit

📚 improve french target language
StefanVukovic99 committed Jan 12, 2024
1 parent f353fb2 commit 60b6649
Showing 4 changed files with 180 additions and 114 deletions.
2 changes: 1 addition & 1 deletion 2-extract-language.py
@@ -4,7 +4,7 @@
source_iso = os.environ.get("source_iso")
target_iso = os.environ.get("target_iso")

input_file = f"data/kaikki/{source_iso}-extract.json"
input_file = f"data/kaikki/{target_iso}-extract.json"
output_file = f"data/kaikki/{source_iso}-{target_iso}-extract.json"

print(f"Reading {input_file} and writing {output_file}...")
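
For context, a minimal sketch of the paths this script derives after the fix, using hypothetical values source_iso=de and target_iso=fr (in the pipeline they come from the environment; auto.sh exports target_iso, for example): the input extract is now keyed by the target (gloss) language, while the output name keeps both ISO codes.

# Illustration only — not part of the commit. The values below are hypothetical.
import os

os.environ["source_iso"] = "de"
os.environ["target_iso"] = "fr"

source_iso = os.environ.get("source_iso")
target_iso = os.environ.get("target_iso")

# After this change the input is the extract for the target (gloss) language.
input_file = f"data/kaikki/{target_iso}-extract.json"                 # data/kaikki/fr-extract.json
output_file = f"data/kaikki/{source_iso}-{target_iso}-extract.json"   # data/kaikki/de-fr-extract.json
print(f"Reading {input_file} and writing {output_file}...")
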
91 changes: 58 additions & 33 deletions 2-tidy-up.py
@@ -1,10 +1,21 @@
import json
import os
import re

source_iso = os.environ.get("source_iso")
target_iso = os.environ.get("target_iso")
kaikki_file = os.environ.get("kaikki_file")

def isInflectionGloss(glosses):
if(target_iso == 'en'):
return re.match(r".*inflection of.*", json.dumps(glosses))
elif(target_iso == 'fr'):
if re.match(r"(.*)du verbe\s+((?:(?!\bdu\b).)*)$", json.dumps(glosses)):
return True
if re.search(r"((?:(?:Masculin|Féminin)\s)?(?:(?:p|P)luriel|(?:s|S)ingulier)) de ([^\s]+)", json.dumps(glosses)):
return True
return False

def handle_level(nest, level):
nest_defs = []
def_index = 0
@@ -57,6 +68,15 @@ def handle_nest(nested_gloss_obj, sense):
form_stuff = []
automated_forms = {}

def addDeinflections(form_dict, word, pos, lemma, inflections):
if(target_iso == 'fr'):
word = re.sub(r"(qu\')?(ils/elles|il/elle/on)\s*", '', word)
form_dict[word] = form_dict.get(word, {})
form_dict[word][lemma] = form_dict[word].get(lemma, {})
form_dict[word][lemma][pos] = form_dict[word][lemma].get(pos, [])

form_dict[word][lemma][pos].extend(inflections)

with open(f'data/kaikki/{kaikki_file}') as file:
for line in file:
line_count += 1
@@ -102,7 +122,7 @@ def handle_nest(nested_gloss_obj, sense):
if form_of:
form_stuff.append([word, sense, pos])
else:
if 'inflection of ' not in json.dumps(glosses):
if not isInflectionGloss(glosses):
lemma_dict[word] = lemma_dict.get(word, {})
lemma_dict[word][pos] = lemma_dict[word].get(pos, {})
lemma_dict[word][pos]['ipa'] = lemma_dict[word][pos].get('ipa', [])
@@ -139,26 +159,36 @@ def handle_nest(nested_gloss_obj, sense):

if curr_sense['glosses']:
lemma_dict[word][pos]['senses'].append(curr_sense)

if 'inflection of ' in json.dumps(glosses):
lemma = sense['glosses'][0]\
.replace('.+(?=inflection of)', '')\
.replace(' \\(.+?\\)', '')\
.replace(':$', '')\
.replace(':\\n.+', '')\
.replace('inflection of ', '')\
.replace(':.+', '')\
.strip()

inflection = sense['glosses'][1] if len(sense['glosses']) > 1 else ''

if inflection and 'inflection of ' not in inflection and word != lemma:
form_dict[word] = form_dict.get(word, {})
form_dict[word][lemma] = form_dict[word].get(lemma, {})
form_dict[word][lemma][pos] = form_dict[word][lemma].get(pos, [])

form_dict[word][lemma][pos].append(inflection)
sense_index += 1
else:
if(target_iso == 'en'):
lemma = re.sub(r'.+(?=inflection of)', '', sense['glosses'][0])
lemma = re.sub(r' \(.+?\)', '', lemma)
lemma = re.sub(r':$', '', lemma)
lemma = re.sub(r':\n.+', '', lemma)
lemma = re.sub(r'inflection of ', '', lemma)
lemma = re.sub(r':.+', '', lemma)
lemma = lemma.strip()

inflection = sense['glosses'][1] if len(sense['glosses']) > 1 else ''

if inflection and 'inflection of ' not in inflection and word != lemma:
addDeinflections(form_dict, word, pos, lemma, [inflection])

elif(target_iso == 'fr'):
inflection, lemma = None, None

if regexMatch := re.match(r"(.*)du verbe\s+((?:(?!\bdu\b).)*)$", sense['glosses'][0]):
inflection, lemma = regexMatch.group(1), regexMatch.group(2)

elif regexMatch := re.match(r"^((?:(?:Masculin|Féminin)\s)?(?:(?:p|P)luriel|(?:s|S)ingulier)) de ([^\s]*)$", sense['glosses'][0].strip()):
inflection, lemma = regexMatch.group(1), regexMatch.group(2)

if inflection and lemma:
inflection = inflection.strip()
lemma = re.sub(r'\.$', '', lemma).strip()

if inflection and word != lemma:
addDeinflections(form_dict, word, pos, lemma, [inflection])

print(f"Processed {line_count} lines...")

@@ -168,14 +198,10 @@ def handle_nest(nested_gloss_obj, sense):
lemma = form_of[0]['word']

if form != lemma and glosses:
form_dict[form] = form_dict.get(form, {})
form_dict[form][lemma] = form_dict[form].get(lemma, {})
form_dict[form][lemma][pos] = form_dict[form][lemma].get(pos, [])

if not "##" in glosses[0]:
form_dict[form][lemma][pos].append(glosses[0])
addDeinflections(form_dict, form, pos, lemma, [glosses[0]])
elif len(glosses) > 1:
form_dict[form][lemma][pos].append(glosses[1])
addDeinflections(form_dict, form, pos, lemma, [glosses[1]])

missing_forms = 0

@@ -187,18 +213,17 @@ def handle_nest(nested_gloss_obj, sense):
for lemma, parts in info.items():
for pos, glosses in parts.items():
if form != lemma:
form_dict[form] = form_dict.get(form, {})
form_dict[form][lemma] = form_dict[form].get(lemma, {})
form_dict[form][lemma][pos] = form_dict[form][lemma].get(pos, [])

form_dict[form][lemma][pos].extend([f"-automated- {gloss}" for gloss in glosses])
inflections = [f"-automated- {gloss}" for gloss in glosses]
addDeinflections(form_dict, form, pos, lemma, inflections)

print(f"There were {missing_forms} missing forms that have now been automatically populated.")

print(f"Writing lemma dict to data/tidy/{source_iso}-{target_iso}-lemmas.json...")
with open(f"data/tidy/{source_iso}-{target_iso}-lemmas.json", "w") as f:
json.dump(lemma_dict, f)

print(f"Writing form dict to data/tidy/{source_iso}-{target_iso}-forms.json...")
with open(f"data/tidy/{source_iso}-{target_iso}-forms.json", "w") as f:
json.dump(form_dict, f)

print('2-tidy-up.py finished.')
print('2-tidy-up.py finished.')
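
For illustration, a minimal, self-contained sketch of what the new French branch is intended to extract, run against two invented example glosses (the gloss strings are assumptions, not taken from the data; the regexes and the clean-up of the trailing period are copied from the diff above):

import re

# Invented example glosses, purely for illustration.
examples = [
    "Première personne du singulier de l'indicatif présent du verbe manger.",
    "Pluriel de cheval.",
]

for gloss in examples:
    inflection, lemma = None, None

    # "… du verbe <lemma>" pattern, as in the diff.
    if m := re.match(r"(.*)du verbe\s+((?:(?!\bdu\b).)*)$", gloss):
        inflection, lemma = m.group(1), m.group(2)
    # "(Masculin/Féminin) Pluriel/Singulier de <lemma>" pattern, as in the diff.
    elif m := re.match(r"^((?:(?:Masculin|Féminin)\s)?(?:(?:p|P)luriel|(?:s|S)ingulier)) de ([^\s]*)$", gloss.strip()):
        inflection, lemma = m.group(1), m.group(2)

    if inflection and lemma:
        inflection = inflection.strip()
        lemma = re.sub(r"\.$", "", lemma).strip()
        print(f"{gloss!r} -> inflection={inflection!r}, lemma={lemma!r}")

# Expected output (roughly):
#   first gloss  -> inflection "Première personne du singulier de l'indicatif présent", lemma "manger"
#   second gloss -> inflection "Pluriel", lemma "cheval"
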
6 changes: 6 additions & 0 deletions auto.sh
@@ -80,6 +80,12 @@ for entry in "${entries[@]}"; do
continue
fi

target_languages="es de en fr ru zh"
if [[ ! "$target_languages" == *"$target_iso"* ]]; then
echo "Unsupported target language: $target_iso"
continue
fi

export target_iso="$target_iso"
export target_language="$target_language_name"

195 changes: 115 additions & 80 deletions util/kaikki-breakdown.py
@@ -2,94 +2,129 @@
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os

counter = {}
for target_iso in ['de', 'es', 'ru', 'zh', 'fr']:
print(f"Processing {target_iso}...")
counter[target_iso] = {}
with open(f'../data/kaikki/{target_iso}-extract.json', "r", encoding="utf-8") as f:
line_count = 0
print_interval = 1000

for line in f:
line_count += 1
if line_count % print_interval == 0:
print(f"Processed {line_count} lines...", end='\r')
try:
obj = json.loads(line.strip())
except json.JSONDecodeError:
print(f"Error decoding JSON in line {line_count}. Skipping...")
continue

if "lang_code" in obj:
counter[target_iso][obj["lang_code"]] = counter[target_iso].get(obj["lang_code"], 0) + 1
else:
if "redirect" in obj:
counter[target_iso]["redirect"] = counter[target_iso].get("redirect", 0) + 1
else:
counter[target_iso]["error"] = counter[target_iso].get("error", 0) + 1
print(json.dumps(counter[target_iso], indent=4))

# open every file that starts with kaikki in the kaikki folder
print(f"Processing en...")
counter["en"] = {}
for file in os.listdir("../data/kaikki"):
if file.startswith("kaikki"):
print(f"Processing {file}...")
with open(f"../data/kaikki/{file}", "r", encoding="utf-8") as f:
line_count = 0
print_interval = 1000

for line in f:
line_count += 1
if line_count % print_interval == 0:
print(f"Processed {line_count} lines...", end='\r')
try:
obj = json.loads(line.strip())
except json.JSONDecodeError:
print(f"Error decoding JSON in line {line_count}. Skipping...")
continue

if "lang_code" in obj:
counter["en"][obj["lang_code"]] = counter["en"].get(obj["lang_code"], 0) + 1
else:
if "redirect" in obj:
counter["en"]["redirect"] = counter["en"].get("redirect", 0) + 1
else:
counter["en"]["error"] = counter["en"].get("error", 0) + 1

for target_iso in counter:
for target_iso2 in counter:
for source_iso in counter[target_iso]:
if source_iso not in counter[target_iso2]:
counter[target_iso2][source_iso] = 0

for target_iso in counter:
if "error" in counter[target_iso]:
del counter[target_iso]["error"]
if "redirect" in counter[target_iso]:
del counter[target_iso]["redirect"]
counter[target_iso] = {k: v for k, v in sorted(counter[target_iso].items(), key=lambda item: item[0])}

heatmap_data = [[counter[key1].get(key2, 0) for key2 in counter[key1]] for key1 in counter]

df = pd.DataFrame(heatmap_data, index=list(counter.keys()), columns=list(counter["de"].keys()))

# Sort rows and columns by their sum
size = 20

# import os
# counter = {}
# for target_iso in ['de', 'es', 'ru', 'zh', 'fr']:
# print(f"Processing {target_iso}...")
# counter[target_iso] = {}
# with open(f'../data/kaikki/{target_iso}-extract.json', "r", encoding="utf-8") as f:
# line_count = 0
# print_interval = 1000

# for line in f:
# line_count += 1
# if line_count % print_interval == 0:
# print(f"Processed {line_count} lines...", end='\r')
# try:
# obj = json.loads(line.strip())
# except json.JSONDecodeError:
# print(f"Error decoding JSON in line {line_count}. Skipping...")
# continue

# if "lang_code" in obj:
# counter[target_iso][obj["lang_code"]] = counter[target_iso].get(obj["lang_code"], 0) + 1
# else:
# if "redirect" in obj:
# counter[target_iso]["redirect"] = counter[target_iso].get("redirect", 0) + 1
# else:
# counter[target_iso]["error"] = counter[target_iso].get("error", 0) + 1
# print(json.dumps(counter[target_iso], indent=4))

# print(f"Processing en...")
# counter["en"] = {}
# for file in os.listdir("../data/kaikki"):
# if file.startswith("kaikki"):
# print(f"Processing {file}...")
# with open(f"../data/kaikki/{file}", "r", encoding="utf-8") as f:
# line_count = 0
# print_interval = 1000

# for line in f:
# line_count += 1
# if line_count % print_interval == 0:
# print(f"Processed {line_count} lines...", end='\r')
# try:
# obj = json.loads(line.strip())
# except json.JSONDecodeError:
# print(f"Error decoding JSON in line {line_count}. Skipping...")
# continue

# if "lang_code" in obj:
# counter["en"][obj["lang_code"]] = counter["en"].get(obj["lang_code"], 0) + 1
# else:
# if "redirect" in obj:
# counter["en"]["redirect"] = counter["en"].get("redirect", 0) + 1
# else:
# counter["en"]["error"] = counter["en"].get("error", 0) + 1

# for target_iso in counter:
# for target_iso2 in counter:
# for source_iso in counter[target_iso]:
# if source_iso not in counter[target_iso2]:
# counter[target_iso2][source_iso] = 0

# for target_iso in counter:
# if "error" in counter[target_iso]:
# del counter[target_iso]["error"]
# if "redirect" in counter[target_iso]:
# del counter[target_iso]["redirect"]
# counter[target_iso] = {k: v for k, v in sorted(counter[target_iso].items(), key=lambda item: item[0])}

# heatmap_data = [[counter[key1].get(key2, 0) for key2 in counter[key1]] for key1 in counter]

# source_languages = list(counter.keys())
# target_languages = list(counter["de"].keys())

# with open('heatmap_data.json', 'w') as f:
# json.dump(heatmap_data, f)

# with open('source_languages.json', 'w') as f:
# json.dump(source_languages, f)

# with open('target_languages.json', 'w') as f:
# json.dump(target_languages, f)

with open('heatmap_data.json', 'r') as f:
heatmap_data = json.load(f)

with open('source_languages.json', 'r') as f:
source_languages = json.load(f)

with open('target_languages.json', 'r') as f:
target_languages = json.load(f)

annotations = []
for row in heatmap_data:
new_row = []
for cell in row:
if cell < 1000:
new_row.append(str(cell))
else:
rounded_value = int(round(cell / 1000, 0))
new_row.append(f"{rounded_value}k")
annotations.append(new_row)

size = 25

df = pd.DataFrame(heatmap_data, index=source_languages, columns=target_languages)
df = df.loc[df.sum(axis=1).sort_values(ascending=False).head(size).index]
df = df[df.sum().sort_values(ascending=False).head(size).index]

annotations = pd.DataFrame(annotations, index=source_languages, columns=target_languages)
annotations = annotations.loc[df.sum(axis=1).sort_values(ascending=False).head(size).index]
annotations = annotations[df.sum().sort_values(ascending=False).head(size).index]

# Set a larger figure size
plt.figure(figsize=(16, 4))
plt.figure(figsize=(15, 4))

# Create a heatmap using seaborn
sns.heatmap(df, annot=True, fmt="d", cmap="YlGnBu", annot_kws={"size": 8}, vmin=10000, vmax=200000)
sns.heatmap(df, annot=annotations, cmap="YlGnBu", annot_kws={"size": 7}, fmt="s", vmax=150000, cbar_kws={'label': 'number of words'})

# Add labels and title
plt.xlabel("Source Language", fontsize=14)
plt.ylabel("Target Language", fontsize=14)
plt.xlabel("Source Language (headwords in this language)", fontsize=8)
plt.ylabel("Target Language (glosses in this language)", fontsize=8)
plt.title("yzkW", fontsize=12)

# Save the plot with a higher resolution
plt.savefig("heatmap.png", dpi=300)
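
A small, self-contained sketch of the new annotation scheme in util/kaikki-breakdown.py — counts under 1000 shown verbatim, larger counts rounded to the nearest thousand with a "k" suffix — using made-up counts and language codes; passing pre-formatted strings to seaborn is what annot=annotations together with fmt="s" does in the diff:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Made-up counts and language codes, purely for illustration.
df = pd.DataFrame(
    [[120000, 800], [25000, 3400]],
    index=["en", "fr"],    # source languages (headwords)
    columns=["de", "es"],  # target languages (glosses)
)

# Same formatting rule as the diff: small counts verbatim, large ones as "Nk".
annotations = df.applymap(lambda cell: str(cell) if cell < 1000 else f"{int(round(cell / 1000))}k")

plt.figure(figsize=(4, 3))
sns.heatmap(df, annot=annotations, fmt="s", cmap="YlGnBu",
            annot_kws={"size": 7}, cbar_kws={"label": "number of words"})
plt.savefig("heatmap-demo.png", dpi=300)
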

