kaggle.py
# -*- coding: utf-8 -*-
from __future__ import division, print_function
from gensim.models import Word2Vec
from keras.models import model_from_json
from keras.preprocessing.sequence import pad_sequences
import nltk
import numpy as np
import collections
import os
import random
import json
from my_w2v_tokenize import *  # provides tokenizer(), used below


def get_stories(story_file, debug=False):
    """Read the story file (one tab-separated "<id>\t<sentence>\t<answer>"
    line per story) and return a list of tokenized word lists."""
    stories = []
    lno = 0
    fin = open(story_file, "rb")
    for line in fin:
        if debug and lno % 100 == 0:
            print("# stories read: %d" % lno)
        line = line.strip()
        line = line.decode("utf8").encode("ascii", "ignore")
        fcid, sent, ans = line.split("\t")
        stories.append(nltk.word_tokenize(" ".join([sent, ans])))
        lno += 1
    fin.close()
    return stories
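
# Usage sketch (the sample line content below is made up for illustration):
# each input line is expected to look like
#   c123\tMitochondria produce energy for the cell\tmitochondria
# stories = get_stories(os.path.join("../data/comp_data",
#                                    "studystack_qa_cleaner_no_qm.txt"))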


def get_question_answer_pairs_old(question_file, is_test=False):
    """Parse the original tab-separated question file into
    (question_words, answer_words, is_correct) tuples."""
    qapairs = []
    fqa = open(question_file, "rb")
    for line in fqa:
        if line.startswith("#"):
            continue
        line = line.strip().decode("utf8").encode("ascii", "ignore")
        cols = line.split("\t")
        question = cols[1]
        qwords = nltk.word_tokenize(question)
        if not is_test:
            # training file parsing: the correct answer is a letter A-D
            correct_ans = cols[2]
            answers = cols[3:]
            correct_ans_idx = ord(correct_ans) - ord('A')
            for idx, answer in enumerate(answers):
                awords = nltk.word_tokenize(answer)
                qapairs.append((qwords, awords, idx == correct_ans_idx))
        else:
            # test file parsing (no correct answer)
            answers = cols[2:]
            for answer in answers:
                awords = nltk.word_tokenize(answer)
                qapairs.append((qwords, awords, None))
    fqa.close()
    return qapairs


def get_question_answer_pairs(question_file, is_test=False):
    """Parse the JSON question file (records with "question", "support",
    "correct_answer" and "distractor1".."distractor3" keys) into
    (question_words, answer_words, is_correct) tuples."""
    qapairs = []
    fqa = open(question_file, "r")
    data = json.load(fqa)
    for l, line in enumerate(data):
        if l % 100 == 0:
            print(l)
        question = line["question"] + " " + line["support"]
        qwords = tokenizer(question)
        # cap question + support at 100 tokens
        if len(qwords) > 100:
            qwords = qwords[:100]
        if not is_test:
            correct_ans = line["correct_answer"]
            answers = [line["distractor1"], line["distractor2"],
                       line["distractor3"], correct_ans]
            # shuffle the four candidates; the correct answer starts at
            # index 3, so its new position is wherever 3 ended up
            new_order = [0, 1, 2, 3]
            random.shuffle(new_order)
            answers = [answers[i] for i in new_order]
            correct_ans_idx = new_order.index(3)
            for idx, answer in enumerate(answers):
                awords = tokenizer(answer)
                qapairs.append((qwords, awords, idx == correct_ans_idx))
        else:
            # test file parsing (no correct answer); assumes test records
            # carry the same distractor keys as training records
            answers = [line["distractor1"], line["distractor2"],
                       line["distractor3"]]
            for answer in answers:
                awords = tokenizer(answer)
                qapairs.append((qwords, awords, None))
    fqa.close()
    return qapairs
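
# A minimal sketch of the JSON record this parser expects (field values
# are made up for illustration; the filename is hypothetical):
#   {"question": "What organ pumps blood through the body?",
#    "support": "The heart pumps blood through the circulatory system.",
#    "correct_answer": "heart",
#    "distractor1": "liver",
#    "distractor2": "lung",
#    "distractor3": "kidney"}
# qapairs = get_question_answer_pairs("train.json")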


def get_story_question_answer_triples(sqa_file):
    """Parse tab-separated (story, question, answer, correct-flag) lines
    into (story_words, question_words, answer_words, is_correct) tuples."""
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore")
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples


def build_vocab(stories, qapairs, testqs):
    """Build a word -> index mapping over all stories, QA pairs and test
    questions, ordered by descending frequency (index 0 is the mask)."""
    wordcounts = collections.Counter()
    for story in stories:
        for sword in story:
            wordcounts[sword] += 1
    for qapair in qapairs:
        for qword in qapair[0]:
            wordcounts[qword] += 1
        for aword in qapair[1]:
            wordcounts[aword] += 1
    for testq in testqs:
        for qword in testq[0]:
            wordcounts[qword] += 1
        for aword in testq[1]:
            wordcounts[aword] += 1
    words = [wordcount[0] for wordcount in wordcounts.most_common()]
    word2idx = {w: i + 1 for i, w in enumerate(words)}  # 0 = mask
    return word2idx


def build_vocab_from_sqa_triples(sqatriples):
    """Like build_vocab, but over (story, question, answer) triples."""
    wordcounts = collections.Counter()
    for sqatriple in sqatriples:
        for sword in sqatriple[0]:
            wordcounts[sword] += 1
        for qword in sqatriple[1]:
            wordcounts[qword] += 1
        for aword in sqatriple[2]:
            wordcounts[aword] += 1
    words = [wordcount[0] for wordcount in wordcounts.most_common()]
    word2idx = {w: i + 1 for i, w in enumerate(words)}  # 0 = mask
    return word2idx


def vectorize_stories(stories, word2idx, story_maxlen):
    Xs = []
    for story in stories:
        Xs.append([word2idx[word] for word in story])
    return pad_sequences(Xs, maxlen=story_maxlen)


def vectorize_qapairs(qapairs, word2idx, seq_maxlen):
    Xq, Xa, Y = [], [], []
    for qapair in qapairs:
        Xq.append([word2idx[qword] for qword in qapair[0]])
        Xa.append([word2idx[aword] for aword in qapair[1]])
        # labels are cosine-distance targets: 0 is closest (correct
        # answer), 1 is farthest (wrong answer)
        Y.append(np.array([0]) if qapair[2] else np.array([1]))
    return (pad_sequences(Xq, maxlen=seq_maxlen),
            pad_sequences(Xa, maxlen=seq_maxlen),
            np.array(Y))
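
# Usage sketch: with N question-answer pairs and seq_maxlen=100, this
# returns Xq and Xa as (N, 100) int index arrays and Y as an (N, 1)
# array of 0/1 cosine-distance targets.
# Xq, Xa, Y = vectorize_qapairs(qapairs, word2idx, 100)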


def vectorize_sqatriples(sqatriples, word2idx, story_maxlen,
                         question_maxlen, answer_maxlen):
    Xs, Xq, Xa, Y = [], [], [], []
    for sqatriple in sqatriples:
        Xs.append([word2idx[sword] for sword in sqatriple[0]])
        Xq.append([word2idx[qword] for qword in sqatriple[1]])
        Xa.append([word2idx[aword] for aword in sqatriple[2]])
        # one-hot labels: [1, 0] = correct, [0, 1] = incorrect
        Y.append(np.array([1, 0]) if sqatriple[3] else np.array([0, 1]))
    return (pad_sequences(Xs, maxlen=story_maxlen),
            pad_sequences(Xq, maxlen=question_maxlen),
            pad_sequences(Xa, maxlen=answer_maxlen),
            np.array(Y))


def get_weights_word2vec(word2idx, w2vfile, w2v_embed_size=300,
                         is_custom=False):
    """Build an embedding matrix for the vocabulary from a word2vec model.
    Words missing from the model keep all-zero vectors."""
    if is_custom:
        word2vec = Word2Vec.load(w2vfile)
    else:
        # load_word2vec_format moved to KeyedVectors in gensim >= 1.0;
        # this call assumes the older gensim API
        word2vec = Word2Vec.load_word2vec_format(w2vfile, binary=True)
    vocab_size = len(word2idx) + 1
    embedding_weights = np.zeros((vocab_size, w2v_embed_size))
    for word, index in word2idx.items():
        try:
            embedding_weights[index, :] = word2vec[word.lower()]
        except KeyError:
            pass  # keep as zero (not ideal, but what else can we do?)
    return embedding_weights
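
# A minimal sketch (assuming the old gensim API; the path matches the
# commented-out main below) of training a custom model that can then be
# loaded here with is_custom=True:
# model = Word2Vec(stories, size=300, min_count=1)
# model.save("../data/comp_data/studystack.bin")
# w2v = get_weights_word2vec(word2idx, "../data/comp_data/studystack.bin",
#                            is_custom=True)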


def get_model_filename(caller, model_type):
    """Derive a model filename from the calling script's name:
    "<script>.json" for the architecture, "<script>-<model_type>.h5"
    for weights."""
    caller = os.path.basename(caller)
    caller = caller[0:caller.rindex(".")]
    if model_type == "json":
        return "%s.%s" % (caller, model_type)
    else:
        return "%s-%s.h5" % (caller, model_type)


def save_model(model, json_filename, weights_filename):
    """Save the model architecture as JSON and the weights as HDF5."""
    model.save_weights(weights_filename)
    with open(json_filename, "wb") as fjson:
        fjson.write(model.to_json())


def load_model(json_filename, weights_filename):
    """Rebuild a model from its JSON architecture and HDF5 weights."""
    with open(json_filename, "rb") as fjson:
        model = model_from_json(fjson.read())
    model.load_weights(filepath=weights_filename)
    return model
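
# Round-trip usage sketch ("weights" as the model_type is hypothetical):
# save_model(model, get_model_filename(__file__, "json"),
#            get_model_filename(__file__, "weights"))
# model = load_model(get_model_filename(__file__, "json"),
#                    get_model_filename(__file__, "weights"))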


##### main ####
#
#import os
#
#DATA_DIR = "../data/comp_data"
#QA_TRAIN_FILE = "8thGr-NDMC-Train.csv"
#STORY_FILE = "studystack_qa_cleaner_no_qm.txt"
#
#stories = get_stories(os.path.join(DATA_DIR, STORY_FILE))
#story_maxlen = max([len(words) for words in stories])
#print("story maxlen=", story_maxlen)
#
## note: get_question_answer_pairs expects a JSON question file
#qapairs = get_question_answer_pairs(os.path.join(DATA_DIR, QA_TRAIN_FILE))
#question_maxlen = max([len(qapair[0]) for qapair in qapairs])
#answer_maxlen = max([len(qapair[1]) for qapair in qapairs])
#print("q=", question_maxlen, "a=", answer_maxlen)
#
#testqs = []  # no test questions in this sketch
#word2idx = build_vocab(stories, qapairs, testqs)
#w2v = get_weights_word2vec(word2idx,
#                           os.path.join(DATA_DIR, "studystack.bin"),
#                           is_custom=True)
#print(w2v.shape)
#
#Xs = vectorize_stories(stories, word2idx, story_maxlen)
#seq_maxlen = max(question_maxlen, answer_maxlen)
#Xq, Xa, Y = vectorize_qapairs(qapairs, word2idx, seq_maxlen)