import csv
import os
import sys
import cPickle
from cnn_text_trainer.rw import datasets
__author__ = 'devashish.shankar'
# TODO: clean this up. Move to core maybe?
def evaluate(data, outputf):
    """
    Ported from the initial version. TODO: refactor to accept the new data format and clean up this code.

    Note: relies on the module-level globals ``testfile``, ``modelfile`` and ``labels`` that are set
    in the ``__main__`` block below, so it cannot be called standalone without defining those first.

    :param data: list of rows in the old format: [[prob_pred, pred_label, actual_label, text], ...]
    :param outputf: output directory
    """
    filept = open(outputf + "/info_" + testfile.split("/")[-1].split(".")[0] + "_" + modelfile.split("/")[-1].split(".")[0] + ".csv", "wb")
    filep = csv.writer(filept)
    filep.writerow(["Number of data-points ", len(data)])
    print "Number of data-points: " + str(len(data))
    filep.writerow(["Number of labels ", len(labels)])
    print "Number of labels: " + str(len(labels))
    perf = float(len([row[1] for row in data if row[1] == row[2]])) / float(len(data))
    filep.writerow(["Accuracy ", str(perf * 100) + "%"])
    filep.writerow([])
    print "Performance: " + str(perf * 100) + "%\n"
    y_pred = [row[1] for row in data]
    y_true = [row[2] for row in data]
    for n in labels:
        tp = float(sum([(y_true[i] == n) and (y_pred[i] == n) for i in range(len(y_true))]))
        tn = float(sum([(y_true[i] != n) and (y_pred[i] != n) for i in range(len(y_true))]))
        fp = float(sum([(y_true[i] != n) and (y_pred[i] == n) for i in range(len(y_true))]))
        fn = float(sum([(y_true[i] == n) and (y_pred[i] != n) for i in range(len(y_true))]))
        # Guard against division by zero when a label never occurs in y_true or y_pred
        fscore = (200 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn) > 0 else 0.0
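        # F1 expressed as a percentage: 100 * 2*tp / (2*tp + fp + fn).
        # Worked example (hypothetical counts): tp=8, fp=2, fn=4 -> 200*8 / (16 + 2 + 4) = 72.7%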
filep.writerow(["Label ",n])
filep.writerow(["F-score ",str(fscore)+"%"])
filep.writerow(["TP ",int(tp),"FP ",int(fp),"TN ",int(tn),"FN ",int(fn)])
filep.writerow([])
print "F-score for label-"+str(n)+" is: "+str(fscore)+"%"
filept.close()
print "Printing output file"
with open(outputf+"/output_"+testfile.split("/")[-1].split(".")[0]+"_"+modelfile.split("/")[-1].split(".")[0]+".csv", "wb") as f:
writer = csv.writer(f)
writer.writerow(["probabilities","y_predicted","y_actual","tweets"])
for line in data:
writer.writerow(line)
print "Printing misclassification file"
with open(outputf+"/misclassification_"+testfile.split("/")[-1].split(".")[0]+"_"+modelfile.split("/")[-1].split(".")[0]+".csv", "wb") as f:
writer = csv.writer(f)
writer.writerow(["probabilities","y_predicted","y_actual","tweets"])
for line in data:
if line[1]!=line[2]:
writer.writerow(line)
if __name__=="__main__":
if len(sys.argv)<6:
print "Usage: testing.py"
print "\t<model file path>"
print "\t<testing file path>"
print "\t<folder to store detailed output analysis>"
print "\t<true/false preprocess>"
print "\t<load word vectors? (true/false). This will give accuracy gains, but will have a lot of memory pressure. If false, words not encountered during training are skipped while testing >"
exit(0)
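    # Example invocation (illustrative paths, not real files):
    #   python test.py models/cnn_model.p data/test_tweets.txt results/ true false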
    testfile = sys.argv[2]
    modelfile = sys.argv[1]
    outputdir = sys.argv[3]
    preprocess = sys.argv[4].lower()
    load_word_vecs = sys.argv[5].lower() == "true"
    if not os.path.exists(outputdir):
        print "Output dir " + outputdir + " doesn't exist. Creating it"
        os.makedirs(outputdir)
    else:
        print "Using output dir " + outputdir + ". Any previous results in this dir on the same dataset may get overwritten."
    model = cPickle.load(open(modelfile, "rb"))
    if load_word_vecs:
        print "Loading word vectors"
        model.add_global_word_vecs({})
        print "Loading word vectors done"
    sentences, vocab, labels = datasets.build_data(testfile, preprocess)
    labels = model.get_labels()
    output = model.classify(sentences)
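    # Inferred from the indexing below: output[0] appears to hold the predicted label index per
    # sentence and output[1] the per-label probability vector for each sentence.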
    # Free memory
    del model
    print "Removed model from memory"
    # Convert the output back into the earlier format
    # TODO: evaluate() should be changed to accept the newer format, which is cleaner
    data = []
    for i in range(len(output[0])):
        actual_label = sentences[i]['y']
        text = sentences[i]['text']
        predicted_label = output[0][i]
        predicted_prob = output[1][i][predicted_label]
        data.append([predicted_prob, labels[predicted_label], labels[actual_label], text])
    evaluate(data, outputdir)