-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhash.py
63 lines (53 loc) · 2.41 KB
/
hash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import numpy as np
class HashTable:
def __init__(self, hash_size, inp_dimensions):
self.hash_size = hash_size
self.inp_dimensions = inp_dimensions # input data dimension
self.hash_table = dict()
self.projections = np.random.randn(self.hash_size, inp_dimensions)
def generate_hash(self, inp_vector):
bools = (np.dot(inp_vector, self.projections.T) > 0).astype('int')
return ''.join(bools.astype('str'))
def __setitem__(self, inp_vec, label):
hash_value = self.generate_hash(inp_vec)
self.hash_table[hash_value] = self.hash_table.get(hash_value, list()) + [label]
def __getitem__(self, inp_vec):
hash_value = self.generate_hash(inp_vec)
# return all items that have the same hash code
return self.hash_table.get(hash_value, [])
class LSH:
def __init__(self, num_tables, hash_size, inp_dimensions):
self.num_tables = num_tables
self.hash_size = hash_size
self.inp_dimensions = inp_dimensions
self.hash_tables = list()
for i in range(self.num_tables):
self.hash_tables.append(HashTable(self.hash_size, self.inp_dimensions))
def __setitem__(self, inp_vec, label):
for table in self.hash_tables:
table[inp_vec] = label
def __getitem__(self, inp_vec):
results = list()
for table in self.hash_tables:
results.extend(table[inp_vec])
return list(set(results))
def query_neighbor(self, inp_vec, dist=1):
# return all data such that the hash distance is within dish.
# the length of hash_code is hash_size * num_tables
same_hash_neighbors = self.__getitem__(inp_vec)
# print(f'hash_code is {same_hash_neighbors}')
return same_hash_neighbors
result = []
select_neighbor = {}
for table in self.hash_tables:
same_value_neighbor = [i for i, j in table.hash_table.items() if j == table[inp_vec]]
for i in same_value_neighbor:
if i not in select_neighbor:
select_neighbor[i] += 1
else:
select_neighbor[i] = 0
# threshold = self.hash_size -1 # only select neighbors such that the hamming distance is less than 1.
top_neighbor = [i for i, j in select_neighbor.items()]
return top_neighbor
# for i in range(hash_size):
# generate a neighbor hash code