-
Notifications
You must be signed in to change notification settings - Fork 1
/
q5.py
139 lines (126 loc) · 4.16 KB
/
q5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import sys
import pickle
import collections
from collections import defaultdict
def main(corpus_en,corpus_de):
#initialize the q(j|i,l,m)
with open('en_dict', 'rb') as handle:
en_dict = pickle.load(handle)
en_corpus = open (corpus_en,"r")
de_corpus = open (corpus_de,"r")
q = defaultdict(dict)
for en_line in en_corpus:
de_line = de_corpus.readline()
en_words = en_line.split()
de_words = de_line.split()
l = len(en_words)
m = len(de_words)
for j in range(0,l+1):
for i in range(1,m+1):
q[(i,l,m)][j] = 1.0/(l+1)
en_corpus.close()
de_corpus.close()
#here comes the em algorithm, returning en_dict and q
#initialize the counts
en_counts = defaultdict(int)
en_de_counts = defaultdict(int)
j_ilm_counts = defaultdict(int)
ilm_counts = defaultdict(int)
en_dict_2, q_2 = ibm2_em(en_dict,q,corpus_en,corpus_de,en_counts,en_de_counts,j_ilm_counts,ilm_counts)
en_corpus = open(corpus_en,"r")
de_corpus = open(corpus_de,"r")
with open("en_dict_2","wb") as handle:
pickle.dump(en_dict_2, handle)
with open("q_2","wb") as handle:
pickle.dump(q_2,handle)
alignment_20_sen = open("alignment_20_sen_ibm2.txt","w")
for i in range(0,20):
en_line = en_corpus.readline()
de_line = de_corpus.readline()
optimum_alignment = alignment(en_line,de_line,en_dict_2,q_2)
alignment_20_sen.write("The" + str(i+1) + "sentence's alignment is " + str(optimum_alignment))
alignment_20_sen.write("\n")
alignment_20_sen.close()
en_corpus.close()
de_corpus.close()
def ibm2_em(en_dict,q,corpus_en,corpus_de,en_counts,en_de_counts,j_ilm_counts,ilm_counts):
#interate for 5 times
for i in range(0,5):
print "Iteration: " + str(i)
en_corpus = open(corpus_en,"r")
de_corpus = open(corpus_de,"r")
for en_line in en_corpus:
de_line = de_corpus.readline()
en_words = en_line.split()
de_words = de_line.split()
#start from NULL, update the total
l = len(en_words)
m = len(de_words)
i = 1
for de_word in de_words:
jilm_sum = q[(i,l,m)][0] * en_dict["NULL"][de_word]
j = 1
for en_word in en_words:
jilm_sum += q[(i,l,m)][j] * en_dict[en_word][de_word]
j += 1
j = 0
delta = q[(i,l,m)][j] * en_dict["NULL"][de_word]/float(jilm_sum)
en_counts["NULL"] += delta
en_de_counts[("NULL",de_word)] += delta
j_ilm_counts[(j,i,l,m)] += delta
ilm_counts[(i,l,m)] += delta
#print str(j_ilm_counts[(j,i,l,m)])
j += 1
for en_word in en_words:
delta = q[(i,l,m)][j] * en_dict[en_word][de_word]/float(jilm_sum)
#update delta here
en_counts[en_word] += delta
en_de_counts[(en_word,de_word)] += delta
j_ilm_counts[(j,i,l,m)] += delta
ilm_counts[(i,l,m)] += delta
j += 1
i += 1
en_corpus.close()
de_corpus.close()
for en_word in en_dict.iterkeys():
for de_word in en_dict[en_word].iterkeys():
en_dict[en_word][de_word] = float(en_de_counts[(en_word,de_word)])/en_counts[en_word]
#not so sure about list key
for ilm in q.iterkeys():
for j in q[ilm].iterkeys():
q[ilm][j] = float(j_ilm_counts[(j,ilm[0],ilm[1],ilm[2])])/ilm_counts[ilm]
#print str(j_ilm_counts[j,ilm[0],ilm[1],ilm[2]])
return en_dict,q
#calculate the alignment
def alignment(en_sentence, de_sentence, en_dict,q):
#align the correct order of english words for the de sentence
en_words = en_sentence.split()
de_words = de_sentence.split()
l = len(en_words)
m = len(de_words)
alignment = []
#iterate through all the german words first
i = 1
for de_word in de_words:
max_t = 0.0
arg_max_alignment = 0
#initialize the max_t with the NULL condition
if de_word in en_dict["NULL"]:
max_t = en_dict["NULL"][de_word] * q[(i,l,m)][0]
#else:
#if the de_word can not be translated into NULL, it can not use position 0
#max_t = 1
#use the en_cur_position variable to record the position of english sentence
en_cur_position = 1
for en_word in en_words:
if de_word in en_dict[en_word]:
temp_prob = en_dict[en_word][de_word] * q[(i,l,m)][en_cur_position]
if temp_prob > max_t:
max_t = temp_prob
arg_max_alignment = en_cur_position
en_cur_position += 1
alignment.append(arg_max_alignment)
i += 1
return alignment
if __name__ == "__main__":
main(sys.argv[1], sys.argv[2])