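"""Benchmark the SemProp matchers and linkers: generate raw matchings between a
serialized field network model and an ontology, combine and summarize them, and
report precision, recall, and F1 against a gold standard."""
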
from collections import defaultdict
from sys import argv
from inputoutput import inputoutput as io
from knowledgerepr import fieldnetwork
from modelstore.elasticstore import StoreHandler
from ontomatch import glove_api
from ontomatch import matcher_lib as matcherlib
from ontomatch.matcher_lib import MatchingType
from ontomatch.sem_prop_benchmarking import write_matchings_to, compute_pr_matchings, read
from ontomatch.ss_api import SSAPI
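

# Run each SemProp matcher/linker once and persist its raw matchings under
# path_to_results (prefixes 'l4', 'l5', 'l42', 'neg_l42', 'l52', 'neg_l52', 'l6', 'l7')
# so they can be combined later without recomputation.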
def generate_matchings(network, store_client, om, path_to_results):
    l7_matchings = matcherlib.find_hierarchy_content_fuzzy(om.kr_handlers, store_client)
    write_matchings_to(path_to_results + 'l7', l7_matchings)

    l4_matchings_01 = matcherlib.find_relation_class_name_matchings(network, om.kr_handlers,
                                                                    minhash_sim_threshold=0.1)
    write_matchings_to(path_to_results + 'l4', l4_matchings_01)

    l5_matchings_01 = matcherlib.find_relation_class_attr_name_matching(network, om.kr_handlers,
                                                                        minhash_sim_threshold=0.1)
    write_matchings_to(path_to_results + 'l5', l5_matchings_01)

    l42_matchings_05, neg_l42_matchings_02 = matcherlib.find_relation_class_name_sem_matchings(
        network, om.kr_handlers,
        sem_sim_threshold=0.5,
        negative_signal_threshold=0.1,
        add_exact_matches=False,
        penalize_unknown_word=True)
    write_matchings_to(path_to_results + 'l42', l42_matchings_05)
    write_matchings_to(path_to_results + 'neg_l42', neg_l42_matchings_02)

    l52_matchings_05, neg_l52_matchings_02 = matcherlib.find_relation_class_attr_name_sem_matchings(
        network, om.kr_handlers,
        semantic_sim_threshold=0.5,
        negative_signal_threshold=0.1,
        add_exact_matches=False,
        penalize_unknown_word=True)
    write_matchings_to(path_to_results + 'l52', l52_matchings_05)
    write_matchings_to(path_to_results + 'neg_l52', neg_l52_matchings_02)

    l6_matchings_02_1, table_groups = matcherlib.find_sem_coh_matchings(network, om.kr_handlers,
                                                                        sem_sim_threshold=0.2,
                                                                        group_size_cutoff=1)
    write_matchings_to(path_to_results + 'l6', l6_matchings_02_1)
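

# Flatten the dict returned by matcherlib.combine_matchings (each value exposes
# get_matchings()) into a plain list of matchings.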
def list_from_dict(combined):
    l = []
    for k, v in combined.items():
        matchings = v.get_matchings()
        for el in matchings:
            l.append(el)
    return l
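

# Combine all signals into a final set of matchings and score it: negative semantic
# signals (nl42/nl52) cancel matching syntactic L4/L5 matchings, L6 coherence groups
# curate L42/L52, the surviving matchings are merged and summarized to ontology
# ancestors, and precision/recall are computed against the ground truth.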
def combine_matchings(l4, l5, l6, l42, l52, nl42, nl52, l7, ground_truth_matchings, om, cutting_ratio=0.8,
                      summary_threshold=1):
    print("Started computation ... ")

    l4_dict = dict()
    for matching in l4:
        l4_dict[matching] = 1

    total_cancelled = 0
    for m in nl42:
        if m in l4_dict:
            total_cancelled += 1
            l4.remove(m)

    l5_dict = dict()
    for matching in l5:
        l5_dict[matching] = 1

    total_cancelled = 0
    for m in nl52:
        if m in l5_dict:
            total_cancelled += 1
            l5.remove(m)

    l6_dict = dict()
    for matching in l6:
        l6_dict[matching] = 1

    # curate l42 with l6: keep a relation-level semantic matching only if it also
    # appears in the semantic-coherence matchings (l6)
    removed_l42 = 0
    # iterate over a copy so that removing from l42 does not skip elements
    for m in list(l42):
        if m not in l6_dict:
            removed_l42 += 1
            l42.remove(m)
    print("rem-l42: " + str(removed_l42))

    # curate l52 with l6: keep an attribute-level matching only if the corresponding
    # relation-level matching ((db, relation, '_'), class) appears in l6, e.g.:
    # (('chemical', 'activity_stds_lookup', 'std_act_id'), ('efo', 'Metabolomic Profiling'))
    # (('chemical', 'activity_stds_lookup', '_'), ('efo', 'Experimental Factor'))
    removed_l52 = 0
    # iterate over a copy so that removing from l52 does not skip elements
    for m in list(l52):
        db, relation, attr = m[0]
        el = ((db, relation, '_'), m[1])
        if el not in l6_dict:
            removed_l52 += 1
            l52.remove(m)
    print("rem-l52: " + str(removed_l52))

    all_matchings = defaultdict(list)
    all_matchings[MatchingType.L4_CLASSNAME_RELATIONNAME_SYN] = l4
    all_matchings[MatchingType.L5_CLASSNAME_ATTRNAME_SYN] = l5
    all_matchings[MatchingType.L42_CLASSNAME_RELATIONNAME_SEM] = l42
    all_matchings[MatchingType.L52_CLASSNAME_ATTRNAME_SEM] = l52
    all_matchings[MatchingType.L7_CLASSNAME_ATTRNAME_FUZZY] = l7

    combined = matcherlib.combine_matchings(all_matchings)
    combined_list = list_from_dict(combined)

    print("StructS ... ")
    combined_sum = matcherlib.summarize_matchings_to_ancestor(om, combined_list,
                                                              threshold_to_summarize=summary_threshold,
                                                              summary_ratio=cutting_ratio)

    precision_sum, recall_sum = compute_pr_matchings(ground_truth_matchings, combined_sum)
    print("Precision: {}\nRecall: {}".format(precision_sum, recall_sum))
    return precision_sum, recall_sum
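

# Load the raw matchings written by generate_matchings together with the gold-standard
# file, combine them, and return (precision, recall).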
def combine_and_report_results(om, path_to_raw_data, path_to_ground_truth_file):
    # Getting ground truth
    with open(path_to_ground_truth_file, 'r') as gt:
        ground_truth_matchings_strings = gt.readlines()

    def parse_strings(list_of_strings):
        # format is: db %%% table %%% attr ==>> onto %%% class_name %%% list_of_matchers
        matchings = []
        for l in list_of_strings:
            tokens = l.split("==>>")
            sch = tokens[0]
            cla = tokens[1]
            sch_tokens = sch.split("%%%")
            sch_tokens = [t.strip() for t in sch_tokens]
            cla_tokens = cla.split("%%%")
            cla_tokens = [t.strip() for t in cla_tokens]
            matching_format = ((sch_tokens[0], sch_tokens[1], sch_tokens[2]), (cla_tokens[0], cla_tokens[1]))
            matchings.append(matching_format)
        return matchings

    ground_truth_matchings = parse_strings(ground_truth_matchings_strings)

    neg_l42 = read(path_to_raw_data + "neg_l42")
    neg_l52 = read(path_to_raw_data + "neg_l52")
    l6 = read(path_to_raw_data + "l6")
    l42 = read(path_to_raw_data + "l42")
    l52 = read(path_to_raw_data + "l52")
    l4 = read(path_to_raw_data + "l4")
    l5 = read(path_to_raw_data + "l5")
    l7 = read(path_to_raw_data + "l7")

    precision, recall = combine_matchings(l4, l5, l6, l42, l52, neg_l42, neg_l52, l7, ground_truth_matchings, om)
    return precision, recall


if __name__ == "__main__":
    """
    argv[1] - path to serialized model
    argv[2] - ontology name
    argv[3] - path to ontology
    argv[4] - path to semantic model
    argv[5] - path to output folder for generating the matchings
    argv[6] - path to gold standard

    Example: python run_semprop.py models/chembl22/ efo cache_onto/efo.pkl glove/glove.6B.100d.txt raw/ gold_standard
    """
    if len(argv) < 7:
        raise RuntimeError("Not enough arguments\nUsage: " +
                           "python run_semprop.py path_to_serialized_model onto_name path_to_ontology " +
                           "path_to_sem_model path_to_results path_to_gold_standard")

    path_to_serialized_model = argv[1]
    onto_name = argv[2]
    path_to_ontology = argv[3]
    path_to_sem_model = argv[4]
    path_to_results = argv[5]
    path_to_gold_standard = argv[6]

    # Deserialize model
    network = fieldnetwork.deserialize_network(path_to_serialized_model)

    # Create client
    store_client = StoreHandler()

    # Load glove model
    print("Loading language model...")
    glove_api.load_model(path_to_sem_model)
    print("Loading language model...OK")

    # Retrieve indexes
    schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
    content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')

    # Create ontomatch api
    om = SSAPI(network, store_client, schema_sim_index, content_sim_index)

    # Load parsed ontology
    om.add_krs([(onto_name, path_to_ontology)], parsed=True)

    # Build content sim
    om.priv_build_content_sim(0.6)

    print("Benchmarking matchers and linkers")
    generate_matchings(network, store_client, om, path_to_results)
    precision, recall = combine_and_report_results(om, path_to_results, path_to_gold_standard)
    print("F1-score: {}".format(2 * precision * recall / (precision + recall)))