triple_sentence_selection.py
126 lines (108 loc) · 3.96 KB
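
# Select evidence sentences for each claim by scoring every candidate
# (document, line) pair with a pre-trained DeFacto triple classifier; pairs
# predicted as evidence are written back as 'predicted_sentences_triple'.
# Run as a script once document retrieval has produced the input jsonl files.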
import jsonlines
import codecs
import json
import numpy as np
try:
    import joblib  # scikit-learn >= 0.23 no longer bundles joblib
except ImportError:
    from sklearn.externals import joblib  # fallback for older scikit-learn installs
from proof_extraction_train import _extract_features
from defacto.model_nl import ModelNL
import unicodedata as ud
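
# Paths: the split Wikipedia dump, the retrieval results to read, and the
# output file for the selected sentences.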
wiki_split_docs_dir = "../wiki-pages-split"
document_results_file = "data/dev_relevant_docs.jsonl" # file with tfidf only
# document_results_file = "data/dev_concatenation.jsonl" # file with tfidf and ner predicted_sentences
document_results_file_oie = "data/dev_concatenation_oie.jsonl" # file with tfidf and oie
document_results_file = jsonlines.open(document_results_file)
document_results_file_oie = jsonlines.open(document_results_file_oie)
relevant_sent_file = "data/dev_concatenation_oie_sentenceasdsadsadsadasdsadas.jsonl"
defacto_clf = joblib.load('defacto/defacto_models/rfc.mod')
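
# Load one document from the split Wikipedia dump; returns "" if it cannot be read.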
def get_file(_doc):
try:
        _doc = ud.normalize('NFC', _doc)
        with codecs.open(wiki_split_docs_dir + "/" + _doc + ".json", "r", "latin-1") as f:
            return json.load(f)
except Exception as _e:
print("Failed Loading" + str(_doc) + str(_e))
return ""
def get_lines(file):
full_lines = file["lines"]
lines = []
for _line in full_lines:
lines.append(_line['content'])
return lines
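
# Build (document, line_number) candidate pairs for every non-empty line of a document.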
def get_pairs_from_doc(doc):
file = get_file(doc)
if file == "":
return ""
lines = get_lines(file)
_pairs = []
for i in range(len(lines)):
if lines[i] != "":
_pairs.append((doc, i))
return _pairs
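
# Fetch the text of a single line from a document; "" if the document is missing.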
def get_sentence(doc, line_num):
file = get_file(doc)
if file == "":
return ""
lines = get_lines(file)
_sentence = lines[line_num]
return _sentence
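
# Read the retrieval output into memory: TF-IDF results and the OIE-augmented results.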
claims = []
for line in document_results_file:
claims.append(line)
claims_oie = []
for line in document_results_file_oie:
claims_oie.append(line)
errors = 0
correct = 0
no_prediction = 0
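
# For each claim, score every candidate (document, line) pair with the DeFacto
# classifier and keep the pairs predicted to be evidence (class != 0).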
if __name__ == "__main__":
with jsonlines.open(relevant_sent_file, mode='w') as writer_c:
for line in claims_oie:
correct_sentences = set()
flag = False
try:
defactoModel = None
all_pairs = line['predicted_sentences']
all_pairs = [tuple(l) for l in all_pairs]
if 'predicted_pages_oie' in line:
documents = line['predicted_pages_oie']
for doc in documents:
pairs = get_pairs_from_doc(doc)
all_pairs.extend(pairs)
all_pairs = list(set(all_pairs))
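                # Score each candidate pair; the DeFacto model for this claim is built
                # lazily on the first pair.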
for pair in all_pairs:
if defactoModel is None:
defactoModel = ModelNL(claim=line['claim'])
sentence = get_sentence(pair[0], pair[1])
if sentence == "":
continue
try:
x = _extract_features(sentence, line['claim'], defactoModel.triples)
x = np.asarray(x)
x = x.reshape(1, -1)
y = defacto_clf.predict(x)
defacto_class = y[0]
                    except Exception as e:
                        errors += 1
                        print("Error: " + str(errors))
                        # Feature extraction or prediction failed for this pair; skip it
                        # so a stale defacto_class from a previous iteration is not reused.
                        continue
if defacto_class == 0:
continue
else:
correct_sentences.add(pair)
            except Exception as e:
                print("Error processing claim: " + str(e))
                flag = True
correct += 1
print(correct)
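            # Keep the classifier-selected pairs; if nothing survived, fall back to
            # all candidate pairs so downstream steps still have evidence to rank.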
if len(correct_sentences) > 0:
correct_sentences = list(correct_sentences)
line['predicted_sentences_triple'] = correct_sentences
else:
print("NO PREDICTION!!!!")
if flag:
no_prediction += 1
line['predicted_sentences_triple'] = all_pairs
writer_c.write(line)
print("Claims with no selected sentences (fallback used): " + str(no_prediction))
print("Claims processed: " + str(correct))