Skip to content

Commit 6fc3dd7

Browse files
author
David Klinger
committedApr 6, 2022
adding etl
1 parent 0ede30d commit 6fc3dd7

File tree

4 files changed

+245
-0
lines changed

4 files changed

+245
-0
lines changed
 

‎.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
etl/data/*
2+
etl/__pycache__/*

‎etl/dictionary.py

+199
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
import extract
2+
import json
3+
4+
5+
class Ontolex_Word:
    """One headword from the dbnary/ontolex dump.

    Stores, per gloss id, the part of speech, an optional English definition,
    and the list of translations collected for that gloss.
    """

    def __init__(self, word, data=None):
        """
        Args:
            word: the headword (source-language form).
            data: optional pre-parsed gloss dict (shape of get_dict()), used
                when re-loading an Ontolex JSON dump.
        """
        self.word = word
        self.data = data if data else {}

    def add_gloss(self, gloss, part_of_speech, can_exist=False):
        """Register a gloss with its part of speech.

        If the gloss already exists its definition and translations are carried
        over; the entry is only (re)written when it is new or can_exist=True.
        """
        definition, translation = None, []
        if gloss in self.data:
            definition = self.data[gloss]['def']
            translation = self.data[gloss]['translation']
        if can_exist or gloss not in self.data:
            self.data[gloss] = {
                'pos': part_of_speech,
                'def': definition,
                'translation': translation,
            }

    def add_translation(self, gloss, translation):
        """Append a translation to an existing gloss.

        Trailing Wiktionary gender markers (" f" / " m") are stripped.
        Raises KeyError if the gloss was never added via add_gloss().
        """
        # BUG FIX: the suffix " f"/" m" is two characters, but the original
        # sliced off three ([:-3]) and truncated the last letter of the word.
        if translation.endswith((' f', ' m')):
            translation = translation[:-2].rstrip()
        self.data[gloss]['translation'].append(translation)

    def add_definition(self, gloss, definition):
        """Set/overwrite the English definition of an existing gloss."""
        self.data[gloss]['def'] = definition

    def get_translations(self):
        """Invert this entry: one Word per distinct translation string.

        Each Word carries this headword (plus any gloss definition) as its
        definition text, grouped under the gloss's part of speech.
        """
        # BUG FIX: the original kept a list and tested `t in results`, which
        # compared the translation string against Word objects (never true)
        # and would have indexed a list by a string if it had matched.
        # A dict keyed by translation gives the intended de-duplication.
        results = {}
        for gloss_data in self.data.values():
            pos = gloss_data['pos']
            definition = gloss_data['def']
            translations = gloss_data['translation']
            for t in translations:
                if not definition:
                    definition = self.word
                if self.word not in definition:
                    definition = f"{self.word}, {definition}"
                w = results.get(t)
                if w is None:
                    w = Word(t)
                    results[t] = w
                w.add_definition(pos, definition)
        return list(results.values())

    def get_dict(self):
        """Return the raw per-gloss data dict (JSON-serializable)."""
        return self.data
52+
53+
54+
class Ontolex:
    # Parses a dbnary ontolex turtle dump (or a previously dumped JSON file)
    # into a map of headword -> Ontolex_Word.

    def __init__(self, get_data=False, read=None):
        # get_data: if True, download a fresh dump via extract.get_ontolex().
        # read: optional JSON filename under data/ to load instead of parsing
        #       the raw .ttl dump.
        if get_data:
            extract.get_ontolex()
        self.words = {}
        if read:
            with open(f"data/{read}", 'r', encoding='utf-8') as f:
                data = json.loads(f.read())
                for w, o_w in data.items():
                    self.words[w] = Ontolex_Word(w, o_w)
        else:
            self.parse_ontolex()

    def get_word(self, word):
        # Get-or-create accessor for a headword's Ontolex_Word entry.
        if word not in self.words:
            self.words[word] = Ontolex_Word(word)
        return self.words[word]

    def parse_ontolex(self):
        # Scan the raw turtle dump line by line, carrying the current gloss /
        # word across branches as stateful locals.
        # NOTE(review): `gloss`, `word` and `new_word` are assumed to be set
        # by an earlier 'eng:__en_gloss' line before the translation/definition
        # branches fire — a dump ordered differently would raise NameError
        # here; confirm against the actual data format.
        print('parsing ontolex data')
        with open('data/raw_dbnary_dump.ttl', 'r', encoding='utf-8-sig') as f:
            data = f.read().split('\n')
        n = len(data)
        divisor = 10 ** 6
        for i, line in enumerate(data):
            # progress indicator: one line per million input lines
            if i % divisor == 0:
                print(f"{i // divisor} of {n // divisor}")
            if 'eng:__en_gloss' in line:
                # Extract the gloss id from the URI, then split it into
                # "word[__part of speech]" fields (underscores -> spaces).
                gloss = line.split(';')[0].split('>')[0].split('/')[-1].split('.')[0].split(':')[-1].strip()
                vals = [x.replace('_', ' ').strip() for x in '_'.join(gloss.split('_')[5:]).split('__')]
                word = vals[0]
                new_word = word
                part_of_speech = vals[1] if len(vals) > 1 else None
                self.get_word(word).add_gloss(gloss, part_of_speech)
            if 'dbnary:isTranslationOf' in line:
                # The translation target tells us which headword the following
                # literal lines belong to; only kept when it matches `word`.
                translation = line.split(';')[0].split('>')[0].split('/')[-1].split('.')[0].split(':')[-1].strip().replace('__en_gloss', '')
                vals = [x.replace('_', ' ').strip() for x in translation.split('__')]
                new_word = vals[0]
                if new_word == word:
                    part_of_speech = vals[1] if len(vals) > 1 else None
                    self.get_word(word).add_gloss(gloss, part_of_speech)
            if '@uk' in line:
                # Ukrainian translation literal: grab the quoted text (the
                # '*' dance preserves escaped quotes), strip wiki-link
                # brackets and keep only the part before any '|' alias.
                translation = line.split('@')[0].replace('\\\"', '*').split("\"")[1].replace('*', '\\\"').replace('[','').replace(']','')
                translation = " ".join([x.split('|')[0] for x in translation.split(' ')])
                if new_word == word:
                    self.get_word(word).add_translation(gloss, translation)
            if 'rdf:value' in line and "@en" in line and '[' not in line:
                # English definition literal for the current gloss.
                definition = line.split('@')[0].replace('\\\"', '*').split("\"")[1].replace('*', '\\\"')
                if new_word == word:
                    self.get_word(word).add_definition(gloss, definition)
        print('parsing complete')

    def get_dictionary(self):
        # Invert the headword map into a target-language Dictionary.
        # NOTE(review): local `dict` shadows the builtin — rename candidate.
        dict = Dictionary()
        for _, word in self.words.items():
            translations = word.get_translations()
            dict.add_to_dictionary(translations)
        return dict

    def get_dict(self):
        # Plain-dict snapshot: word -> Ontolex_Word.get_dict().
        d = {}
        for w in self.words:
            d[w] = self.words[w].get_dict()
        return d

    def dump(self, loc, indent=None):
        # Serialize the parsed data as JSON under data/.
        # indent: optional pretty-print level passed through to json.dumps.
        with open(f'data/{loc}', 'w+', encoding='utf-8') as f:
            if indent:
                f.write(
                    json.dumps(self.get_dict(), indent=indent, ensure_ascii=False)
                )
            else:
                f.write(
                    json.dumps(self.get_dict(), ensure_ascii=False)
                )
131+
132+
133+
class Usage:
    """One part-of-speech usage of a word, with an ordered, de-duplicated
    collection of definitions."""

    def __init__(self, word, pos):
        self.word = word
        self.pos = pos
        # dict used as an insertion-ordered set: keys are definitions,
        # values are always None
        self.definitions = {}

    def add_definitions(self, definitions):
        """Add several definitions, preserving order and skipping duplicates."""
        for d in definitions:
            self.add_definition(d)

    def add_definition(self, definition):
        """Add one definition (no-op if already present)."""
        self.definitions[definition] = None

    def get_definitions(self):
        """Return the definitions as a list, in insertion order."""
        return list(self.definitions)

    def merge(self, other):
        """Fold another Usage's definitions into this one.

        BUG FIX: the original interleaved via zip(), which truncates to the
        shorter side and silently dropped the other Usage's extra definitions
        whenever the two had different counts. Adding all of other's
        definitions keeps everything; the dict de-duplicates repeats.
        """
        self.add_definitions(other.get_definitions())
154+
155+
156+
class Word:
    """A target-language word with its usages grouped by part of speech."""

    def __init__(self, word):
        self.word = word
        # part of speech -> Usage
        self.usages = {}

    def get_word_no_accent(self):
        """Return the word with the combining acute accent (stress mark) removed."""
        return self.word.replace("́", '')

    def add_definition(self, pos, definition):
        """Attach a definition under the given part of speech, creating the
        Usage entry if it does not exist yet."""
        if pos in self.usages:
            u = self.usages[pos]
        else:
            u = Usage(self.word, pos)
            self.usages[pos] = u
        u.add_definition(definition)

    def merge(self, other):
        """Merge another Word's usages into this one, combining same-POS usages.

        BUG FIX: the original iterated `for pos, usage in other:` — a Word
        instance is not iterable, so every merge raised TypeError. Iterate the
        usages dict instead.
        """
        for pos, usage in other.usages.items():
            if pos in self.usages:
                self.usages[pos].merge(usage)
            else:
                self.usages[pos] = usage

    def to_dict(self):
        """Serialize as {pos: [definitions]}.

        Added because Dictionary.to_dict already calls this method on its
        Word values; without it, serialization raised AttributeError.
        """
        return {pos: usage.get_definitions() for pos, usage in self.usages.items()}
179+
180+
181+
class Dictionary:
    """Target-language dictionary keyed by word string; duplicate Words are
    merged into a single entry."""

    def __init__(self):
        # word string -> Word
        self.dict = {}

    def add_to_dictionary(self, to_add):
        """Add a Word (merging with any existing entry for the same word)
        or recursively add a list of Words."""
        if isinstance(to_add, Word):
            if to_add.word in self.dict:
                self.dict[to_add.word].merge(to_add)
            else:
                self.dict[to_add.word] = to_add
        if isinstance(to_add, list):
            for w in to_add:
                self.add_to_dictionary(w)

    def to_dict(self):
        """Serialize as {word: Word.to_dict()}.

        BUG FIX: the original iterated `for k, v in self.dict:` (which unpacks
        the key strings and raises ValueError), shadowed the builtin `dict`,
        and never returned the result. NOTE(review): relies on Word providing
        a to_dict() method — confirm it exists on the Word class.
        """
        result = {}
        for k, v in self.dict.items():
            result[k] = v.to_dict()
        return result

‎etl/extract.py

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import bz2
2+
import os
3+
import requests
4+
5+
# Ensure the working data directory exists before any download/parse step.
os.makedirs('data', exist_ok=True)
6+
7+
8+
def get_ontolex():
    # Download the latest English dbnary ontolex dump and decompress it to
    # data/raw_dbnary_dump.ttl. Network + disk I/O only; returns nothing.
    session = requests.session()
    print('downloading latest ontolex data from dbnary')
    with session.get('http://kaiko.getalp.org/static/ontolex/latest/en_dbnary_ontolex.ttl.bz2', stream=True) as f:
        # BZ2File wraps the raw response stream; .read() pulls and
        # decompresses the entire dump into memory before it is written out.
        data = bz2.BZ2File(f.raw).read()
    print('decompressing')
    with open('data/raw_dbnary_dump.ttl', 'wb+') as f:
        f.write(data)
    print('decompressing finished')
17+
18+
19+
def get_lemmas():
    # Fetch every page title in Wiktionary's Category:Ukrainian_lemmas via the
    # MediaWiki API, following 'continue' cursors until the category is
    # exhausted. Returns the list of titles.
    session = requests.session()

    def add_words(words, results):
        # Collect member titles from one API response, skipping sub-categories.
        for word in results['query']['categorymembers']:
            title = word['title']
            if 'Category' not in title:
                words.append(title)

    words = []

    results = session.get('https://en.wiktionary.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:Ukrainian_lemmas&format=json&cmlimit=max').json()
    add_words(words, results)

    while 'continue' in results:
        # NOTE(review): the MediaWiki API returns 'continue' as a dict (e.g.
        # {'cmcontinue': ..., 'continue': ...}); interpolating that dict
        # directly into the cmcontinue= URL parameter looks suspect — confirm
        # the endpoint accepts it, or whether this should be
        # results['continue']['cmcontinue'].
        cmcontinue = results['continue']
        results = session.get(f'https://en.wiktionary.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:Ukrainian_lemmas&format=json&cmlimit=max&cmcontinue={cmcontinue}').json()
        add_words(words, results)

    return words

‎etl/main.py

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from dictionary import Ontolex

# ETL driver: load the previously dumped ontolex JSON, re-dump it
# pretty-printed, then build the inverted (translation -> headword)
# dictionary and print it.
o = Ontolex(read='ontolex_data.json')
o.dump('ontolex_data.json', indent=2)
d = o.get_dictionary()
print(d)

0 commit comments

Comments
 (0)
Please sign in to comment.