Skip to content

Commit 5786439

Browse files
authored
Function to compute Dice coefficients of bitarray pairs (#567)
* Function in anonlink.similarities to compute the Dice coefficient on pairs of bitarrays * Add changelog entry cleanup PR * Test with all zeros * Remove ubuntu-18.04 unittests
1 parent cdca890 commit 5786439

File tree

7 files changed

+109
-3
lines changed

7 files changed

+109
-3
lines changed

.github/workflows/unittests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
runs-on: ${{ matrix.os }}
1616
strategy:
1717
matrix:
18-
os: [macos-latest, windows-latest, ubuntu-18.04, ubuntu-20.04]
18+
os: [macos-latest, windows-latest, ubuntu-20.04]
1919
python: ["3.8", "3.9", "3.10", "3.11"]
2020

2121
steps:

CHANGELOG.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
0.15.3
2+
======
3+
4+
- Added function to compute Dice coefficients of bitarray pairs. #567
5+
16
0.15.2
27
======
38

anonlink/similarities/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
functions are possible as well.
1111
"""
1212

13-
from anonlink.similarities._dice_python import dice_coefficient_python
13+
from anonlink.similarities._dice_python import (dice_coefficient_python,
14+
dice_coefficient_pairs_python)
1415
from anonlink.similarities._smc import (hamming_similarity,
1516
simple_matching_coefficient)
1617

anonlink/similarities/_dice_python.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from itertools import repeat
33
from typing import Iterable, Optional, Sequence, Tuple
44

5+
import numpy as np
56
from bitarray import bitarray
67

78
from anonlink.similarities._utils import (sort_similarities_inplace,
@@ -77,3 +78,43 @@ def dice_coefficient_python(
7778
sort_similarities_inplace(result_sims, result_indices0, result_indices1)
7879

7980
return result_sims, (result_indices0, result_indices1)
81+
82+
83+
84+
def _dice_coefficient_of_pair(f0, f1) -> float:
    """Return the Dice coefficient of one pair of bitarrays."""
    combined_count = f0.count() + f1.count()
    if not combined_count:
        # Neither bitarray has a set bit; avoid division by zero.
        return 0.0
    return 2.0 * (f0 & f1).count() / combined_count


def dice_coefficient_pairs_python(
    datasets: Iterable[Tuple[bitarray, bitarray]]
) -> np.ndarray:
    """Find Dice coefficients of bitarray pairs.

    This version is written in Python, so it does not rely on
    architecture-specific instructions. It may be slower than an
    accelerated version.

    A similarity is computed for every pair of bitarrays in the input,
    and the similarity for each pair is returned as a floating-point
    value. A pair in which neither bitarray has a set bit scores 0.0.

    :param datasets: The candidate pairs. Each pair is a tuple of two
        bitarrays. Any iterable is accepted; a sized one (such as a
        list) lets the result array be allocated up front.

    :return: Similarity scores for every input pair, in input order,
        as a one-dimensional array of ``numpy.float64`` values.
    """
    try:
        candidate_pair_count = len(datasets)
    except TypeError:
        # Unsized iterable (e.g. a generator): -1 tells numpy to read
        # until exhaustion instead of preallocating.
        candidate_pair_count = -1

    return np.fromiter(
        (_dice_coefficient_of_pair(f0, f1) for f0, f1 in datasets),
        dtype=np.float64,
        count=candidate_pair_count,
    )

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@
6868

6969
setup(
7070
name="anonlink",
71-
version='0.15.2',
71+
version='0.15.3',
7272
description='Anonymous linkage using cryptographic hashes and bloom filters',
7373
long_description=readme,
7474
long_description_content_type='text/x-rst',

tests/test_e2e.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,5 +273,48 @@ def test_greedy_chunked_matching_works(self):
273273
assert mapping == merged_mapping
274274

275275

276+
277+
class TestSimilarityStream(EntityHelperMixin, unittest.TestCase):
    """Check pairwise Python Dice scores against the accelerated scores."""

    proportion = 0.8
    sample = 150

    def setUp(self):
        # Draw two subsets from a generated name list and hash each
        # record, keeping the first element of every streamed result.
        self.nl = randomnames.NameList(300)
        self.s1, self.s2 = self.nl.generate_subsets(self.sample, self.proportion)
        self.key_lists = generate_key_lists('secret', len(self.nl.schema_types))
        self.f1 = tuple(
            hashed[0]
            for hashed in bloomfilter.stream_bloom_filters(
                self.s1, self.key_lists, self.nl.SCHEMA))
        self.f2 = tuple(
            hashed[0]
            for hashed in bloomfilter.stream_bloom_filters(
                self.s2, self.key_lists, self.nl.SCHEMA))

    def test_similarity_stream(self):
        # Every (left, right) combination, with the left index varying
        # slowest so a pair's flat position is left * len(right) + right.
        all_pairs = [(left, right) for left in self.f1 for right in self.f2]
        right_count = len(self.f2)

        pair_scores = anonlink.similarities.dice_coefficient_pairs_python(
            all_pairs
        )

        assert len(pair_scores) == len(self.f1) * right_count

        found = anonlink.candidate_generation.find_candidate_pairs(
            (self.f1, self.f2),
            anonlink.similarities.dice_coefficient_accelerated,
            threshold=0.0,
        )
        scores, _, (left_indices, right_indices) = found

        for score, left_index, right_index in zip(
                scores, left_indices, right_indices):
            # Locate the same pair in the flat list of pairwise scores.
            flat_index = left_index * right_count + right_index
            assert pair_scores[flat_index] == score
318+
276319
if __name__ == '__main__':
277320
unittest.main()

tests/test_similarity_dice.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from clkhash.key_derivation import generate_key_lists
55
from hypothesis import given, strategies
66

7+
import anonlink.similarities
78
from anonlink import similarities
89

910
FLOAT_ARRAY_TYPES = 'fd'
@@ -258,6 +259,21 @@ def test_all_low(self, sim_fun, k, threshold):
258259
assert (rec_is0.typecode in UINT_ARRAY_TYPES
259260
and rec_is1.typecode in UINT_ARRAY_TYPES)
260261

262+
def test_candidate_stream_right_low(self):
    # One pair: a bitarray with set bits against an all-zero bitarray.
    left = bitarray('01001011') * 8
    right = bitarray('00000000') * 8
    pairs = [(left, right)]

    scores = anonlink.similarities.dice_coefficient_pairs_python(pairs)

    assert len(scores) == 1
    assert not any(score != 0.0 for score in scores)
268+
269+
def test_candidate_stream_all_low(self):
270+
datasets = list(zip(*[[bitarray('00000000') * 8],
271+
[bitarray('00000000') * 8]]))
272+
sims = anonlink.similarities.dice_coefficient_pairs_python(datasets)
273+
274+
assert len(sims) == 1
275+
assert all(s == 0.0 for s in sims)
276+
261277
@pytest.mark.parametrize('sim_fun', SIM_FUNS)
262278
def test_order(self, sim_fun):
263279
similarity = sim_fun(

0 commit comments

Comments
 (0)