import extract
import json


class Ontolex_Word:

    def __init__(self, word, data=None):
        self.word = word
        if data:
            self.data = data
        else:
            self.data = {}

    def add_gloss(self, gloss, part_of_speech, can_exist=False):
        definition, translation = None, []
        if gloss in self.data:
            definition, translation = self.data[gloss]['def'], self.data[gloss]['translation']
        if can_exist or gloss not in self.data:
            self.data[gloss] = {
                'pos': part_of_speech,
                'def': definition,
                'translation': translation
            }

    def add_translation(self, gloss, translation):
        # strip the trailing two-character gender marker (' f' or ' m')
        if translation.endswith(' f') or translation.endswith(' m'):
            translation = translation[:-2]
        self.data[gloss]['translation'].append(translation)

    def add_definition(self, gloss, definition):
        self.data[gloss]['def'] = definition

    def get_translations(self):
        # build one Word per distinct translated form, keyed by that form
        results = {}
        for gloss_data in self.data.values():
            pos, definition, translations = gloss_data['pos'], gloss_data['def'], gloss_data['translation']
            # fall back to the headword and make sure it appears in the definition
            if not definition:
                definition = self.word
            if self.word not in definition:
                definition = f"{self.word}, {definition}"
            for t in translations:
                if t in results:
                    w = results[t]
                else:
                    w = Word(t)
                    results[t] = w
                w.add_definition(pos, definition)
        return list(results.values())

    def get_dict(self):
        return self.data


class Ontolex:

    def __init__(self, get_data=False, read=None):
        if get_data:
            extract.get_ontolex()
        self.words = {}
        if read:
            with open(f"data/{read}", 'r', encoding='utf-8') as f:
                data = json.loads(f.read())
            for w, o_w in data.items():
                self.words[w] = Ontolex_Word(w, o_w)
        else:
            self.parse_ontolex()

    def get_word(self, word):
        if word not in self.words:
            self.words[word] = Ontolex_Word(word)
        return self.words[word]

    def parse_ontolex(self):
        print('parsing ontolex data')
        # scan the raw dbnary Turtle dump line by line; relies on each gloss
        # appearing before the translation and definition lines that refer to it
        with open('data/raw_dbnary_dump.ttl', 'r', encoding='utf-8-sig') as f:
            data = f.read().split('\n')
        n = len(data)
        divisor = 10 ** 6
        for i, line in enumerate(data):
            if i % divisor == 0:
                # progress indicator, printed once per million lines
                print(f"{i // divisor} of {n // divisor}")
            if 'eng:__en_gloss' in line:
                # gloss resource: pull the gloss id, the English headword and its part of speech out of the URI
                gloss = line.split(';')[0].split('>')[0].split('/')[-1].split('.')[0].split(':')[-1].strip()
                vals = [x.replace('_', ' ').strip() for x in '_'.join(gloss.split('_')[5:]).split('__')]
                word = vals[0]
                new_word = word
                part_of_speech = vals[1] if len(vals) > 1 else None
                self.get_word(word).add_gloss(gloss, part_of_speech)
            if 'dbnary:isTranslationOf' in line:
                # translation block: only keep it if it refers to the current headword
                translation = line.split(';')[0].split('>')[0].split('/')[-1].split('.')[0].split(':')[-1].strip().replace('__en_gloss', '')
                vals = [x.replace('_', ' ').strip() for x in translation.split('__')]
                new_word = vals[0]
                if new_word == word:
                    part_of_speech = vals[1] if len(vals) > 1 else None
                    self.get_word(word).add_gloss(gloss, part_of_speech)
            if '@uk' in line:
                # '@uk' language tag: the Ukrainian translation text
                translation = line.split('@')[0].replace('\\\"', '*').split("\"")[1].replace('*', '\\\"').replace('[', '').replace(']', '')
                translation = " ".join([x.split('|')[0] for x in translation.split(' ')])
                if new_word == word:
                    self.get_word(word).add_translation(gloss, translation)
            if 'rdf:value' in line and "@en" in line and '[' not in line:
                # '@en' rdf:value literal: the English sense definition
                definition = line.split('@')[0].replace('\\\"', '*').split("\"")[1].replace('*', '\\\"')
                if new_word == word:
                    self.get_word(word).add_definition(gloss, definition)
        print('parsing complete')

    def get_dictionary(self):
        dictionary = Dictionary()
        for word in self.words.values():
            translations = word.get_translations()
            dictionary.add_to_dictionary(translations)
        return dictionary

    def get_dict(self):
        d = {}
        for w in self.words:
            d[w] = self.words[w].get_dict()
        return d

    def dump(self, loc, indent=None):
        # json.dumps treats indent=None the same as omitting it, so one call covers both cases
        with open(f'data/{loc}', 'w+', encoding='utf-8') as f:
            f.write(json.dumps(self.get_dict(), indent=indent, ensure_ascii=False))


class Usage:

    def __init__(self, word, pos):
        self.word = word
        self.pos = pos
        # dict keys double as an insertion-ordered, de-duplicated set of definitions
        self.definitions = {}

    def add_definitions(self, definitions):
        for d in definitions:
            self.add_definition(d)

    def add_definition(self, definition):
        self.definitions[definition] = None

    def get_definitions(self):
        return list(self.definitions)

    def merge(self, other):
        # pull in the other usage's definitions; duplicates are dropped by the dict
        self.add_definitions(other.get_definitions())


class Word:

    def __init__(self, word):
        self.word = word
        self.usages = {}

    def get_word_no_accent(self):
        # drop the combining acute accent (U+0301) used to mark stress
        return self.word.replace('\u0301', '')

    def add_definition(self, pos, definition):
        if pos in self.usages:
            u = self.usages[pos]
        else:
            u = Usage(self.word, pos)
            self.usages[pos] = u
        u.add_definition(definition)

    def merge(self, other):
        # fold another Word's usages into this one, combining definitions per part of speech
        for pos, usage in other.usages.items():
            if pos in self.usages:
                self.usages[pos].merge(usage)
            else:
                self.usages[pos] = usage


class Dictionary:

    def __init__(self):
        self.dict = {}

    def add_to_dictionary(self, to_add):
        if isinstance(to_add, Word):
            if to_add.word in self.dict:
                self.dict[to_add.word].merge(to_add)
            else:
                self.dict[to_add.word] = to_add
        if isinstance(to_add, list):
            for w in to_add:
                self.add_to_dictionary(w)

    def to_dict(self):
        d = {}
        for k, v in self.dict.items():
            d[k] = v.to_dict()
        return d
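A minimal usage sketch of the classes above, assuming this file is saved as ontolex.py and that data/raw_dbnary_dump.ttl has already been fetched; the driver script itself and the ontolex.json cache file name are hypothetical, not part of the commit.

# example_build.py -- hypothetical driver script, not part of this commit
from ontolex import Ontolex

# first run: parse the raw Turtle dump and cache the parsed glosses as JSON under data/
ontolex = Ontolex()
ontolex.dump('ontolex.json', indent=2)

# later runs: reload the cached JSON instead of re-parsing the dump,
# then flip it into a Dictionary keyed by the translated (Ukrainian) forms
cached = Ontolex(read='ontolex.json')
dictionary = cached.get_dictionary()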