-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcognateLanguage_Translate.py
161 lines (126 loc) · 5.14 KB
/
cognateLanguage_Translate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import string # example use here: to be able to remove punctuation
import re # example use here: to be able to remove repeating spaces
from dictAsFile_wrapper import *
def justTwoInitSylls_CVC(word):
    """Shorten *word* so it ends one character after its second vowel.

    Only the lowercase letters 'aeiou' count as vowels. The slice keeps
    everything up to and including the second vowel plus the single
    character that follows it (if any), e.g. 'banana' -> 'banan'.

    A word with fewer than two vowels (including the empty string or a
    word with no vowels at all) is returned unchanged instead of raising.
    """
    vowels = 'aeiou'
    vowelsSeen = 0
    for index, letter in enumerate(word):
        if letter in vowels:
            vowelsSeen += 1
            if vowelsSeen == 2:
                # +2: include the second vowel itself and one trailing character
                return word[:index + 2]
    # fewer than two vowels: nothing to cut
    return word
def countVowels(word):
    """Return how many characters of *word* are vowels ('aeiou'), ignoring case."""
    return sum(1 for letter in word.lower() if letter in 'aeiou')
def indexOfNthInstanceOfVowel(mystr,n):
    """Return the index of the n-th (1-based) lowercase vowel in *mystr*.

    Returns None when *mystr* contains fewer than *n* vowels.
    """
    found = 0
    for position, char in enumerate(mystr):
        if char not in 'aeiou':
            continue
        found += 1
        if found == n:
            return position
    return None # in case cannot find
def vowelGroupCount(myStr):
    """Count the vowels in *myStr* that are not immediately followed by the same letter.

    A run of one repeated vowel therefore contributes 1 ('aa' -> 1), while
    adjacent but different vowels are each counted ('ea' -> 2).
    """
    # Pair every character with its successor; the last character is paired
    # with None so it can never compare equal to "the next letter".
    successors = list(myStr[1:]) + [None]
    return sum(
        1
        for current, following in zip(myStr, successors)
        if current != following and current in 'aeiou'
    )
def isEven(num):
    """Return True when *num* leaves no remainder on division by 2."""
    remainder = num % 2
    return remainder == 0
def formatInput(input):
    """Normalise raw user text before it is split into words.

    Steps, in order: drop every punctuation character except '?', put a
    space in front of each '?' (so it can be handled as its own word),
    lowercase everything, and collapse runs of spaces into a single space.
    Leading/trailing single spaces are NOT removed.
    """
    # every punctuation mark is discarded except the question mark
    discarded = set(string.punctuation) - {'?'}
    kept = [ch for ch in input if ch not in discarded]
    text = ''.join(kept).replace('?',' ?').lower()
    # squeeze repeated spaces down to one
    return re.sub(' +', ' ', text)
# ---------------------------------------------------------------------------
# Script body: prompt for a phrase, translate it, print the long and short
# translations.
# NOTE: raw_input is the Python 2 builtin, so this script runs under Python 2.
# ---------------------------------------------------------------------------

filename = 'hashtable.pkl' # 'output_shortlist.txt'
data = {}
input = ''
translation = '< Translation Not Found. >'
# Defined before the branch below so the final print never hits an
# undefined name when the user enters nothing translatable.
shortTranslation = ''

input = raw_input('Enter English word or sentence gloss to translate [and then hit Enter key]:\n\t')
# formatInput collapses repeated spaces but can leave one leading/trailing
# space (e.g. '?' becomes ' ?'); strip it so no empty "word" is produced.
input = formatInput(input).strip()

if input != "":

    translation = ''

    # get hashtable file into a dictionary
    # (readFileToDict comes from dictAsFile_wrapper; presumably returns a dict
    #  mapping English words to their translations - verify against that module)
    data = readFileToDict(filename)

    # detect CogLang as input by checking if input is one abnormally long 'word' (and isn't found to be an English entry) and other indicators
    if (' ' not in input and input not in data and len(input) >= 9 and isEven(vowelGroupCount(input))):

        # split input into words by every 2nd vowel (and final consonant of sentence-word)
        newInput = []
        while len(input) > 1:
            nextIndex = indexOfNthInstanceOfVowel(input,2)
            if nextIndex is None:
                # Fewer than two vowels remain, so there is no further split
                # point: keep the remainder as one chunk instead of crashing
                # on None + 1.
                newInput.append(input)
                break
            if nextIndex != len(input)-2:
                newInput.append(input[:nextIndex+1])
            else:
                # second vowel is followed by exactly one final character:
                # that character belongs to this last chunk
                newInput.append(input)
            input = input[nextIndex+1:]

        # read the .txt word list, one stripped line per entry
        filename = 'output_shortlist.txt'
        with open(filename,'r') as f:
            data = [line.strip() for line in f]

        for word in newInput:
            translationFound = False
            # search list for reverse word translation to English:
            # a line matches when it starts with the chunk, and its second
            # comma-separated field is taken as the translation
            for line in data:
                if ',' in line and line.startswith(word):
                    translatedWord = line.split(',')[1]
                    if translatedWord != '?':
                        translation += translatedWord + ' '
                    else:
                        # Replace the trailing space with '??'; the final
                        # translation[:-1] below removes one character again.
                        translation = translation[:-1] + translatedWord + '?'
                    translationFound = True
            # add in '?' for words not found
            if not translationFound:
                translation += '[?]' + ' '

        shortTranslation = "(N/A for English.)"

    else: # otherwise English sentence detected --> translate to Coglang

        # split input into words (split() with no argument never yields
        # empty strings, which would break the suffix check below)
        input = input.split()

        for word in input:
            translationFound = False
            # account for plural nouns or 2nd person singular verbs
            if word not in data and word.endswith('s') and word[:-1] in data:
                word = word[:-1]
            # search for word translation in data ("data" is a hashtable/dictionary)
            if word in data:
                translatedWord = data[word]
                shortTranslatedWord = justTwoInitSylls_CVC(translatedWord)
                shortTranslation += ' ' + shortTranslatedWord
                translation += translatedWord + ' '
                translationFound = True
            # add in '?' for words not found
            if not translationFound:
                translation += '[?]' + ' '

    # remove final space ' '
    translation = translation[:-1]

print ('Long Translation:\n\t' + '"' + translation.capitalize()+'.' + '"')
print ('Short Translation:\n\t' + shortTranslation)