-
Notifications
You must be signed in to change notification settings - Fork 0
/
align_from_phrases.py
executable file
·212 lines (190 loc) · 8.74 KB
/
align_from_phrases.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#!/usr/bin/python
import argparse
import sys, os
import codecs
from collections import namedtuple
import align_utils
phrase = namedtuple("phrase", "english, prob")
def setup_parser():
parser = argparse.ArgumentParser(description='Create alignment matrix based on a phrase table.')
parser.add_argument('-s', '--source', dest='source_filename',
required=True,
help='The source language version of paired f-e text.')
parser.add_argument('-t', '--target', dest='target_filename',
required=True,
help='The target language version of paired f-e text.')
parser.add_argument('-p', '--phrase-table', dest='phrase_table_filename',
required=True,
help='The text for the phrase table, with scoring.')
parser.add_argument('-a', '--alignments', dest='alignment_filename',
required=True,
help='The text for the old alignments.')
parser.add_argument('-l', '--max-phrase-len', dest='max_phrase_len',
type=int, default=4,
help='Maximum phrase length allowed')
parser.add_argument('-n', '--num-phrases', dest='num_phrases',
type=int, default=sys.maxint,
help='Maximum number of phrases allowed, truncated based on phrase probability')
parser.add_argument('-v', '--verbose', dest='verbose',
action='store_true', default=False,
help='Verbose mode (default=off)')
opts = parser.parse_args()
return opts
def main():
opts = setup_parser()
phrase_table = TM(opts.phrase_table_filename,
opts.max_phrase_len,
opts.num_phrases)
max_sent_len = 25
num_phrase_aligned = 0
num_not_aligned = 0
num_fast_aligned = 0
sf = codecs.open(opts.source_filename, 'r', 'utf-8')
tf = codecs.open(opts.target_filename, 'r', 'utf-8')
af = codecs.open(opts.alignment_filename, 'r', 'utf-8')
sline = sf.readline()
tline = tf.readline()
aline = af.readline()
while sline and tline and aline:
swords = sline.strip().split()
twords = tline.strip().split()
# We convert the string for the alignment pairs into a list of tuples
# for alignment pairs.
align_pairs = align_utils.str_to_alignment(aline)
# We only do the optimal phrase based aligner if the sentence is under a
# given length, due to performance concerns.
if len(swords) <= max_sent_len:
alignment = phrase_alignment(swords, twords,
phrase_table,
opts.num_phrases)
if (alignment):
num_phrase_aligned += 1
# Alignment purely on phrases failed, so for now we just fall back
# to the traditional GIZA++ based word alignment matrix.
else:
num_not_aligned += 1
alignment = align_pairs
# Sentence is too long, so it will be too slow to use the standard
# phrase-based aligner.
else:
num_fast_aligned += 1
alignment = align_pairs
print align_utils.alignment_to_str(alignment)
# Get next lines and repeat
sline = sf.readline()
tline = tf.readline()
aline = af.readline()
if (sline):
sys.stderr.write('Error: source sentence file was longer than'
+ ' cooresponding target sentence file')
if (tline):
sys.stderr.write('Error: target sentence file was longer than'
+ ' cooresponding source sentence file')
sys.stderr.write("NUMBER of correctly aligned = %d\n" % num_phrase_aligned)
sys.stderr.write("NUMBER unable to align = %d\n" % num_not_aligned)
sys.stderr.write("NUMBER too long, so fast aligned = %d\n" % num_fast_aligned)
def coverage(sequence):
# Generate a coverage for a sequence of indexes #
# You can do things like:
# c1 | c2 to "add" coverages
# c1 & c2 will return 0 if c1 and c2 do NOT overlap
# c1 & c2 will be != 0 if c1 and c2 DO overlap
return reduce(lambda x,y: x|y, map(lambda i: long(1) << i, sequence), 0)
# Attempts to fully align the source sentence with the target sentence based on
# the phrase table in the translation model.
# Argument descriptions:
# f: the tokenized source sentence
# e: the tokenized target sentence
# tm: the translation model and phrase table
# k: the maximum number of phrases translations available we allow for each
# target phrase
def phrase_alignment(f, e, tm, k):
f = tuple(map(lambda s: s.lower(), f))
e = tuple(map(lambda s: s.lower(), e))
# alignments[i] is a list of all the phrases in f that could have
# generated phrases starting at position i in e
alignments = [[] for _ in e]
for fi in xrange(len(f)):
for fj in xrange(fi+1,len(f)+1):
if f[fi:fj] in tm:
for phrase in tm[f[fi:fj]]:
ephrase = tuple(phrase.english.split())
for ei in xrange(len(e)+1-len(ephrase)):
ej = ei+len(ephrase)
if ephrase == e[ei:ej]:
alignments[ei].append((ej, phrase.prob, fi, fj))
# Sort each possible phrase translation by descending probability.
for i, alignment in enumerate(alignments):
alignment.sort(key=lambda x: -x[1])
alignments[i] = alignment[:k]
# Compute sum of probability of all possible alignments by dynamic programming.
# To do this, recursively compute the sum over all possible alignments for each
# each pair of English prefix (indexed by ei) and French coverage (indexed by
# coverage v), working upwards from the base case (ei=0, v=0) [i.e. forward chaining].
# The final sum is the one obtained for the pair (ei=len(e), v=range(len(f))
sum_data = namedtuple("sum_data", "prob, alignments")
chart = [{} for _ in e] + [{}]
chart[0][0] = sum_data(0.0, [])
for ei, sums in enumerate(chart[:-1]):
for v in sums:
for ej, prob, fi, fj in alignments[ei]:
if coverage(range(fi,fj)) & v == 0:
new_v = coverage(range(fi,fj)) | v
next_prob = sums[v].prob * prob
old_alignments = sums[v].alignments
# Append the newest phrase alignment to the list of alignments.
new_alignments = old_alignments + [((fi, fj), (ei, ej))]
if new_v in chart[ej]:
if chart[ej][new_v].prob < next_prob:
chart[ej][new_v] = sum_data(next_prob, new_alignments)
# chart[ej][new_v] = logadd(chart[ej][new_v], sums[v]+logprob)
else:
chart[ej][new_v] = sum_data(next_prob, new_alignments)
# chart[ej][new_v] = sums[v]+logprob
goal = coverage(range(len(f)))
if goal in chart[len(e)]:
return expand_phrase_alignment(chart[len(e)][goal].alignments)
else:
sys.stderr.write("ERROR: COULD NOT ALIGN SENTENCE\n")
return None
# We want alignments to be on a word-to-word basis, not phrase-to-phrase.
# Thus, we expand the phrase alignment so that every word in the source
# phrase aligns with every word in the target phrase.
def expand_phrase_alignment(alignments):
expanded = []
for alignment in alignments:
fi = alignment[0][0]
fj = alignment[0][1]
ei = alignment[1][0]
ej = alignment[1][1]
for fk in xrange(fi, fj):
for ek in xrange(ei, ej):
expanded.append((fk, ek))
return expanded
# A translation model is a dictionary where keys are tuples of French words
# and values are lists of (english, logprob) named tuples. For instance,
# the French phrase "que se est" has two translations, represented like so:
# tm[('que', 'se', 'est')] = [
# phrase(english='what has', logprob=-0.301030009985),
# phrase(english='what has been', logprob=-0.301030009985)]
# k is a pruning parameter: only the top k translations are kept for each f.
def TM(filename, max_len, k):
sys.stderr.write("Reading translation model from %s...\n" % (filename,))
tm = {}
with codecs.open(filename, 'r', 'utf-8') as pt_text:
for line in pt_text:
pt_entry = line.strip().split(" ||| ")
f = pt_entry[0].lower()
e = pt_entry[1].lower()
p = float(pt_entry[2].split()[0])
# We only record phrases under a given length
if (len(f.split()) <= max_len and len(e.split()) <= max_len):
tm.setdefault(tuple(f.split()), []).\
append(phrase(e, float(p)))
# prune all but the top k translations
for f in tm:
tm[f].sort(key=lambda x: -x.prob)
del tm[f][k:]
return tm
if (__name__ == '__main__'):
main()