-
Notifications
You must be signed in to change notification settings - Fork 0
/
helpers.py
71 lines (62 loc) · 1.54 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""
Miscellaneous helper functions.
"""
import os
import conf
import copy
import cyrconv
# Module-level converter instance, created once at import time and used by
# cyrilic_check_convert() for its script checks and conversion.
crconv = cyrconv.CirConv()
def fsize(path):
    """
    Return file size in MB, rounded to two decimal places.

    One MB is taken as 1024 * 1024 bytes. The previous divisor
    (1024 * 1000) mixed binary and decimal units, so a file of
    exactly 1 MB was reported as 1.02.

    path -- path of the file to measure; os.path.getsize raises
            OSError if it does not exist or is inaccessible.
    """
    bytes_per_mb = 1024 * 1024
    return round(os.path.getsize(path) / bytes_per_mb, 2)
def sort_dictionary(d):
    """
    Sort every value list of the dictionary in place.

    d -- dictionary whose values are lists (anything with an
         in-place .sort() method).

    Returns the same dictionary object, now with sorted values.
    """
    # Debug trace kept from the original implementation.
    print("SORTING", d)
    for values in d.values():
        # list.sort() sorts in place and returns None, so its result
        # must not be assigned back (that replaced every value by None).
        values.sort()
    return d
def compile_sources():
    """
    Build the text listing the sources used for the corpus.

    Works on a deep copy of conf.corpus_sources, so the configuration
    itself is never modified. When conf.SETUSECRO is false, the
    'HR.Txt' entry is left out. Every remaining entry is rendered as
    a "Source id:<key>" header followed by the entry's value, and all
    rendered entries are concatenated into one string.
    """
    sources = copy.deepcopy(conf.corpus_sources)
    if not conf.SETUSECRO:
        # Croatian dictionary is not in use, so omit its source.
        del sources['HR.Txt']
    entry_format = "\n\nSource id:%s\n\n%s"
    return ''.join(
        entry_format % (source_id, sources[source_id])
        for source_id in sources
    )
def cyrilic_check_convert(word):
    """
    Return the word in Cyrillic script, or False if that is impossible.

    A word that the converter reports as entirely Cyrillic is
    returned unchanged. Otherwise the word is converted only when
    the converter reports it as all-Latin; any other word yields
    False.
    """
    # Already Cyrillic: nothing to do.
    if crconv.is_all_cyrillic(word):
        return word
    # Per the original note, is_all_latin is expected to accept only
    # letters of the Serbian alphabet (no x, y, q); anything else is
    # not safe to convert.
    if not crconv.is_all_latin(word):
        return False
    return crconv.convert(word)
def perc(x, y, r=2):
    """
    Return what percentage x is of y, rounded to r decimal places.

    Both x and y are converted with float() first. A zero y raises
    ZeroDivisionError.
    """
    scaled = 100 * float(x)
    total = float(y)
    return round(scaled / total, r)