# similarity.py
import sys
import os
import pandas as pd
import itertools
# import collections
# import merkle_tree
import numpy as np
from glob import glob
from tqdm.auto import tqdm  # tqdm_notebook is deprecated; tqdm.auto picks the right frontend
import numpy.testing as npt
from datasketch import MinHash
# TODO: clean up project structure
sys.path.append('../merkle/')
sys.path.append('../lsh_forest/')
# Read a directory path and glob pattern and return a dict mapping each
# matching filename to a DataFrame of its contents.
def load_dataset_dir(dirpath, glob_pattern, **kwargs):
    dataset = {}
    for filename in glob(os.path.join(dirpath, glob_pattern)):
        dataset[os.path.basename(filename)] = pd.read_csv(filename, **kwargs)
    return dataset
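# Example usage (directory and pattern are hypothetical):
#   dataset = load_dataset_dir('../data/', '*.csv', index_col=0)
#   # e.g. {'run_01.csv': <DataFrame>, 'run_02.csv': <DataFrame>, ...}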
# Compute pairwise similarity over a dataset dict using similarity_metric.
# Returns a list of (name1, name2, similarity_score) tuples sorted by score
# in descending order; pairs scoring below threshold are dropped.
def get_pairwise_similarity(dataset, similarity_metric, threshold=-1.0):
    pairwise_similarity = []
    pairs = list(itertools.combinations(dataset.keys(), 2))
    for d1, d2 in tqdm(pairs, desc='graph pairs', leave=False):
        score = similarity_metric(dataset[d1], dataset[d2])
        if score >= threshold:
            pairwise_similarity.append((d1, d2, score))
    pairwise_similarity.sort(key=lambda x: x[2], reverse=True)
    return pairwise_similarity
# Jaccard Functions
# Compute raw cell-wise Jaccard similarity between two dataframes.
# Quadratic per-cell loop with exception handling: very slow; prefer
# get_jaccard_coefficient below.
def get_jaccard_coefficient_slow(df1, df2):
    rowsize = max(df1.shape[0], df2.shape[0])
    colsize = max(df1.shape[1], df2.shape[1])
    intersection = 0.0
    for i in range(rowsize):
        for j in range(colsize):
            try:
                # assert_equal treats NaN == NaN as equal, unlike ==
                npt.assert_equal(df1.iloc[i, j], df2.iloc[i, j])
                intersection += 1
            except (AssertionError, IndexError):
                pass
    union = (df1.size + df2.size) - intersection
    if union == 0:
        return 0.0
    return intersection / union
# Faster, vectorized version. Only the overlapping top-left block is
# compared. NaNs on both sides are filled with -inf so that NaN == NaN
# counts as a match; this can mislabel genuine -inf cells as matches.
def get_jaccard_coefficient(df1, df2):
    minshape = np.minimum(df1.shape, df2.shape)
    iM = np.equal(df1.fillna(-np.inf).values[:minshape[0], :minshape[1]],
                  df2.fillna(-np.inf).values[:minshape[0], :minshape[1]])
    intersection = np.sum(iM)
    union = (df1.size + df2.size) - intersection
    if union == 0:
        return 0.0
    return float(intersection) / union
# TODO: MinHash-based approximation of the Jaccard coefficient (not yet
# implemented); see the sketch below.
def get_minhash_coefficient(df1, df2):
    pass
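# A minimal sketch of what get_minhash_coefficient could compute, using the
# datasketch MinHash API imported above. The position-tagged cell encoding
# and the helper name are assumptions for illustration, not part of the
# original module.
def get_minhash_coefficient_sketch(df1, df2, num_perm=128):
    def minhash_of(df):
        m = MinHash(num_perm=num_perm)
        # Hash each cell together with its (row, col) position so the
        # estimate tracks the positional Jaccard metrics in this module.
        for i, row in enumerate(df.itertuples(index=False)):
            for j, val in enumerate(row):
                m.update('{}|{}|{}'.format(i, j, val).encode('utf8'))
        return m
    # MinHash.jaccard estimates the Jaccard similarity of the two cell sets
    return minhash_of(df1).jaccard(minhash_of(df2))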
# Assumes corresponding columns share the same names and that pk_col_name
# refers to the same key column in both frames.
def compute_jaccard_DF(df1, df2, pk_col_name=None):
    # Fill NaN values with a sentinel token so NaN == NaN counts as a match
    df1 = df1.fillna('jac_tmp_NA')
    df2 = df2.fillna('jac_tmp_NA')
    if pk_col_name:
        df3 = df1.merge(df2, how='outer', on=pk_col_name,
                        suffixes=['_jac_tmp_1', '_jac_tmp_2'])
    else:
        df3 = df1.merge(df2, how='outer', left_index=True, right_index=True,
                        suffixes=['_jac_tmp_1', '_jac_tmp_2'])
    # Columns shared by both frames get suffixed during the merge
    comparison_cols = set(col for col in df3.columns if '_jac_tmp_' in str(col))
    common_cols = set(col.split('_jac_tmp_', 1)[0] for col in comparison_cols)
    if len(common_cols) == 0:
        return 0.0
    # Columns unique to one frame are never suffixed
    uniq_cols = set(col for col in df3.columns if '_jac_tmp_' not in str(col))
    if pk_col_name:
        uniq_cols.remove(pk_col_name)
    # Mark cells in shared columns True where the two sides agree
    for col in common_cols:
        left = col + '_jac_tmp_1'
        right = col + '_jac_tmp_2'
        df3[col] = df3[left] == df3[right]
    # Cells in columns unique to one frame can never match
    for col in uniq_cols:
        df3[col] = False
    # Drop the superfluous suffixed columns
    df3 = df3.drop(columns=comparison_cols)
    if pk_col_name:
        df3 = df3.drop(columns=[pk_col_name])
    # Jaccard similarity: matching cells / total cells in the outer merge
    intersection = df3.values.sum()
    union = df3.size
    if union == 0:
        return 0.0
    return float(intersection) / union
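# Worked example (hypothetical data): two versions of a table keyed on 'id'
# where one of the two non-key cells changed.
#   a = pd.DataFrame({'id': [1, 2], 'x': ['u', 'v']})
#   b = pd.DataFrame({'id': [1, 2], 'x': ['u', 'w']})
#   compute_jaccard_DF(a, b, pk_col_name='id')  # -> 0.5 (1 of 2 cells match)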
# Assumes corresponding columns share the same names and that both frames
# have directly comparable indices.
def compute_jaccard_DF_index(df1, df2):
    # Fill NaN values with a sentinel token so NaN == NaN counts as a match
    df1 = df1.fillna('jac_tmp_NA')
    df2 = df2.fillna('jac_tmp_NA')
    df3 = df1.merge(df2, how='outer', left_index=True, right_index=True,
                    suffixes=['_jac_tmp_1', '_jac_tmp_2'])
    # Columns shared by both frames get suffixed during the merge
    comparison_cols = set(col for col in df3.columns if '_jac_tmp_' in str(col))
    common_cols = set(col.split('_jac_tmp_', 1)[0] for col in comparison_cols)
    if len(common_cols) == 0:
        return 0.0
    # Columns unique to one frame are never suffixed
    uniq_cols = set(col for col in df3.columns if '_jac_tmp_' not in str(col))
    # Mark cells in shared columns True where the two sides agree
    for col in common_cols:
        left = col + '_jac_tmp_1'
        right = col + '_jac_tmp_2'
        try:
            df3[col] = df3[left] == df3[right]
        except Exception as e:
            # Surface which column pair failed before re-raising
            print(col, left, right)
            print(df3[left] == df3[right])
            raise e
    # Cells in columns unique to one frame can never match
    for col in uniq_cols:
        df3[col] = False
    # Drop the superfluous suffixed columns
    df3 = df3.drop(columns=comparison_cols)
    # Jaccard similarity: matching cells / total cells in the outer merge
    intersection = df3.values.sum()
    union = df3.size
    if union == 0:
        return 0.0
    return float(intersection) / union
# Same as compute_jaccard_DF_index, but aligns rows by position rather than
# by label: both frames are reset to 0..n-1 indices before the outer merge.
def compute_jaccard_DF_reindex(df1, df2):
    # Fill NaN values with a sentinel token and reset to positional indices
    df1 = df1.fillna('jac_tmp_NA').reset_index(drop=True)
    df2 = df2.fillna('jac_tmp_NA').reset_index(drop=True)
    df3 = df1.merge(df2, how='outer', left_index=True, right_index=True,
                    suffixes=['_jac_tmp_1', '_jac_tmp_2'])
    # Columns shared by both frames get suffixed during the merge
    comparison_cols = set(col for col in df3.columns if '_jac_tmp_' in str(col))
    common_cols = set(col.split('_jac_tmp_', 1)[0] for col in comparison_cols)
    if len(common_cols) == 0:
        return 0.0
    # Columns unique to one frame are never suffixed
    uniq_cols = set(col for col in df3.columns if '_jac_tmp_' not in str(col))
    # Mark cells in shared columns True where the two sides agree
    for col in common_cols:
        left = col + '_jac_tmp_1'
        right = col + '_jac_tmp_2'
        try:
            df3[col] = df3[left] == df3[right]
        except Exception as e:
            # Surface which column pair failed before re-raising
            print(col, left, right)
            print(df3[left] == df3[right])
            raise e
    # Cells in columns unique to one frame can never match
    for col in uniq_cols:
        df3[col] = False
    # Drop the superfluous suffixed columns
    df3 = df3.drop(columns=comparison_cols)
    # Jaccard similarity: matching cells / total cells in the outer merge
    intersection = df3.values.sum()
    union = df3.size
    if union == 0:
        return 0.0
    return float(intersection) / union
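# A minimal smoke test under assumed toy data; the frames below are invented
# for illustration and are not part of the original module.
if __name__ == '__main__':
    a = pd.DataFrame({'id': [1, 2, 3], 'x': ['u', 'v', 'w']})
    b = pd.DataFrame({'id': [1, 2, 3], 'x': ['u', 'v', 'z']})
    print(compute_jaccard_DF(a, b, pk_col_name='id'))  # 2 of 3 'x' cells match
    print(compute_jaccard_DF_index(a, b))              # 'id' and 'x' both compared
    print(compute_jaccard_DF_reindex(a, b))            # same frames, positional alignment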