Skip to content

Commit

Permalink
membership operator for lsh
Browse files Browse the repository at this point in the history
  • Loading branch information
ekzhu committed Apr 12, 2016
1 parent b6082f2 commit 1be2a05
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 13 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,16 @@ lsh = MinHashLSH(threshold=0.5, num_perm=128)
lsh.insert("m2", m2)
lsh.insert("m3", m3)

# Check for membership using the key
print("m2" in lsh)
print("m3" in lsh)

# Using m1 as the query, retrieve the keys of the qualifying datasets
result = lsh.query(m1)
print("Candidates with Jaccard similarity > 0.5", result)

# Remove key from lsh
lsh.remove("m2")
```

The Jaccard similarity threshold must be set at initialization, and cannot
Expand Down
10 changes: 8 additions & 2 deletions datasketch/lsh.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ def is_empty(self):
def _H(self, hs):
return "".join("%.8x" % h for h in hs)

def __contains__(self, key):
'''
Return True only if the key exists in the index.
'''
return key in self.keys

def insert(self, key, minhash):
'''
Insert a unique `key` to the index, together
Expand Down Expand Up @@ -133,9 +139,9 @@ def query(self, minhash):
candidates.add(key)
return list(candidates)

def delete(self, key):
def remove(self, key):
'''
Delete the key from the index.
Remove the key from the index.
'''
if key not in self.keys:
raise ValueError("The given key does not exist")
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
# Versions should comply with PEP440. For a discussion on single-sourcing
# the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
version='0.2.2',
version='0.2.3',

description='Probabilistic data structures for processing very large datasets',
long_description=long_description,
Expand Down
20 changes: 10 additions & 10 deletions test/lsh_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ def test_insert(self):
items.extend(t[H])
self.assertTrue("a" in items)
self.assertTrue("b" in items)
self.assertTrue("a" in lsh.keys)
self.assertTrue("b" in lsh.keys)
self.assertTrue("a" in lsh)
self.assertTrue("b" in lsh)
for i, H in enumerate(lsh.keys["a"]):
self.assertTrue("a" in lsh.hashtables[i][H])

Expand All @@ -57,7 +57,7 @@ def test_query(self):
m3 = MinHash(18)
self.assertRaises(ValueError, lsh.query, m3)

def test_delete(self):
def test_remove(self):
lsh = MinHashLSH(threshold=0.5, num_perm=16)
m1 = MinHash(16)
m1.update("a".encode("utf8"))
Expand All @@ -66,14 +66,14 @@ def test_delete(self):
lsh.insert("a", m1)
lsh.insert("b", m2)

lsh.delete("a")
lsh.remove("a")
self.assertTrue("a" not in lsh.keys)
for table in lsh.hashtables:
for H in table:
self.assertGreater(len(table[H]), 0)
self.assertTrue("a" not in table[H])

self.assertRaises(ValueError, lsh.delete, "c")
self.assertRaises(ValueError, lsh.remove, "c")

def test_pickle(self):
lsh = MinHashLSH(threshold=0.5, num_perm=16)
Expand Down Expand Up @@ -115,8 +115,8 @@ def test_insert(self):
items.extend(t[H])
self.assertTrue("a" in items)
self.assertTrue("b" in items)
self.assertTrue("a" in lsh.keys)
self.assertTrue("b" in lsh.keys)
self.assertTrue("a" in lsh)
self.assertTrue("b" in lsh)
for i, H in enumerate(lsh.keys["a"]):
self.assertTrue("a" in lsh.hashtables[i][H])

Expand All @@ -140,22 +140,22 @@ def test_query(self):
m3 = mg.minhash(np.random.uniform(1, 10, 10))
self.assertRaises(ValueError, lsh.query, m3)

def test_delete(self):
def test_remove(self):
lsh = WeightedMinHashLSH(threshold=0.5, sample_size=4)
mg = WeightedMinHashGenerator(10, 4)
m1 = mg.minhash(np.random.uniform(1, 10, 10))
m2 = mg.minhash(np.random.uniform(1, 10, 10))
lsh.insert("a", m1)
lsh.insert("b", m2)

lsh.delete("a")
lsh.remove("a")
self.assertTrue("a" not in lsh.keys)
for table in lsh.hashtables:
for H in table:
self.assertGreater(len(table[H]), 0)
self.assertTrue("a" not in table[H])

self.assertRaises(ValueError, lsh.delete, "c")
self.assertRaises(ValueError, lsh.remove, "c")

def test_pickle(self):
lsh = WeightedMinHashLSH(threshold=0.5, sample_size=4)
Expand Down

0 comments on commit 1be2a05

Please sign in to comment.