-
Notifications
You must be signed in to change notification settings - Fork 0
/
translit_generator.py
69 lines (54 loc) · 2.61 KB
/
translit_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import codecs
import random
from utils import *
import multiprocessing
from multiprocessing import Process, Lock
from multiprocessing.dummy import Pool as ThreadPool
class Translit(object):
    """Generate a randomised transliteration of a text corpus.

    The corpus is split into one batch of rows per CPU, each batch is
    transliterated by a worker thread, and the batches are written to the
    destination file in their original order.
    """

    def __init__(self, source_language, target_language):
        """Remember the language pair that is passed to ``get_mapping``."""
        self.source_language = source_language
        self.target_language = target_language
        # Batch index -> transliterated text of that batch.
        self.translits = dict()
        # Guards self.translits, which the worker threads write to.
        self._lock = Lock()

    @staticmethod
    def _split_batches(rows, batch_count):
        """Split ``rows`` into exactly ``batch_count`` consecutive slices.

        The slice size is rounded up, so every row lands in some batch;
        trailing batches may be empty when there are few rows.
        ``batch_count`` must be a positive integer.
        """
        # Ceiling division: flooring here would silently drop the last
        # ``len(rows) % batch_count`` rows.
        batch_size = -(-len(rows) // batch_count)
        return [
            rows[index * batch_size:(index + 1) * batch_size]
            for index in range(batch_count)
        ]

    # Generate translits
    def translit(self, source_language_corpus_path, destination_translit_corpus_path):
        """Transliterate a UTF-8 corpus file and write the result.

        Args:
            source_language_corpus_path: path of the UTF-8 text file to read.
            destination_translit_corpus_path: path of the UTF-8 file to write;
                also stored on the instance for ``aprove_finish``.

        One output line is produced per input line.
        """
        self.destination_translit_corpus_path = destination_translit_corpus_path

        # Get mapping. It is used as ``mapping[char]`` -> sequence of candidate
        # replacements (see translit_async); the exact structure comes from
        # get_mapping, which is defined outside this class.
        mapping = get_mapping(self.source_language, self.target_language)

        # Read source corpus text
        with codecs.open(source_language_corpus_path, 'r', encoding="utf-8") as corpus_file:
            rows = corpus_file.read().split("\n")
        # A file ending with a newline yields one trailing empty string that
        # is not a real row; drop it so no extra blank line is emitted.
        if rows[-1] == "":
            rows.pop()

        worker_count = multiprocessing.cpu_count()

        # Discard results of any previous run on this instance.
        with self._lock:
            self.translits.clear()

        # Per thread text batch: (rows, mapping, batch index)
        to_be_processed = [
            (batch_rows, mapping, batch_index)
            for batch_index, batch_rows in enumerate(self._split_batches(rows, worker_count))
        ]

        # Run threads. A blocking map is used instead of map_async + callback:
        # the callback would run on the pool's result-handler thread, where an
        # exception (e.g. an unwritable destination) would leave wait() hanging.
        pool = ThreadPool(worker_count)
        try:
            statuses = pool.map(self.translit_async, to_be_processed)
        finally:
            # Always release the worker threads.
            pool.close()
            pool.join()

        self.aprove_finish(statuses)
        print("Translit generation finished")

    # Generate translit from batch async
    def translit_async(self, batch):
        """Transliterate one batch and store it in ``self.translits``.

        Args:
            batch: tuple ``(text_rows, mapping, batch_index)``.

        Each character found in ``mapping`` is replaced by a random choice
        among ``mapping[char]``; other characters are kept unchanged. Any
        newline inside a produced row is replaced by a space and each row is
        terminated with a newline.

        Returns:
            Always 0.
        """
        text_rows, source_destination_mappings, batch_index = batch[0], batch[1], batch[2]
        print("Aync translitaration started {}".format(batch_index))
        translit_rows = []
        try:
            for text_row in text_rows:
                translit_row = "".join(
                    random.choice(source_destination_mappings[char])
                    if char in source_destination_mappings else char
                    for char in text_row
                )
                translit_rows.append(translit_row.replace("\n", " ") + "\n")
        except Exception as ex:
            # Best effort: report the error and keep the rows produced so far.
            print(ex)
        with self._lock:
            self.translits[batch_index] = "".join(translit_rows)
        print("Aync translitaration finished {}".format(batch_index))
        return 0

    # Approve finish of generating and writing transliterations
    def aprove_finish(self, status):
        """Write all stored batches, ordered by batch index, to the destination.

        Args:
            status: per-batch return values of ``translit_async``; unused.
        """
        print("Aproove finish started")
        with self._lock:
            ordered_batches = [self.translits[index] for index in sorted(self.translits)]
        with codecs.open(self.destination_translit_corpus_path, 'w', encoding="utf-8") as translit_file:
            for batch_text in ordered_batches:
                translit_file.write(batch_text)
        print("Aproove finish finished")