Skip to content

Commit

Permalink
Fix for one_to_one_matches incorrect median (#56)
Browse files Browse the repository at this point in the history
* Ensure set of values is sorted before taking median

* Write test case for one_to_one_matches

* Add reverse keyword

* Also reverse the sort order in the test.

* Remove unused import and trailing whitespace

---------

Co-authored-by: Shaad Alaka <[email protected]>
  • Loading branch information
Archer6621 and Shaad Alaka committed Oct 10, 2023
1 parent 3bf9264 commit 06fdc82
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 1 deletion.
47 changes: 47 additions & 0 deletions tests/test_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import unittest

import math
from valentine.metrics.metrics import one_to_one_matches
from copy import deepcopy

# Baseline matcher output used by the tests: each key is a pair of
# (table name, column name) tuples and each value is a similarity score.
matches = {
    (("table_1", "Cited by"), ("table_2", "Cited by")): 0.8374313,
    (("table_1", "Authors"), ("table_2", "Authors")): 0.83498037,
    (("table_1", "EID"), ("table_2", "EID")): 0.8214057,
}

# Column-name pairs corresponding to the entries in ``matches``.
ground_truth = [
    ("Cited by", "Cited by"),
    ("Authors", "Authors"),
    ("EID", "EID"),
]


class TestMetrics(unittest.TestCase):
    """Tests for ``one_to_one_matches`` from ``valentine.metrics.metrics``."""

    def test_one_to_one(self):
        """Check one-to-one filtering of duplicate and low-scoring matches.

        Two scenarios are exercised:

        1. Every source column gets a second, weaker candidate; the result
           is expected to contain three entries and none of the weaker ones.
        2. One extra low-similarity pair is added; every surviving score is
           expected to be at or above the median threshold.
        """
        m = deepcopy(matches)

        # Add multiple matches per column: each original pair gets a sibling
        # whose target column is suffixed with 'foo' and whose score is halved.
        pairs = list(m.keys())
        for (ta, ca), (tb, cb) in pairs:
            m[((ta, ca), (tb, cb + 'foo'))] = m[((ta, ca), (tb, cb))] / 2

        # Verify that len gets corrected to 3 while the input keeps all 6.
        # Separate assertions so a failure reports which side was wrong.
        m_one_to_one = one_to_one_matches(m)
        self.assertEqual(len(m_one_to_one), 3)
        self.assertEqual(len(m), 6)

        # Verify that none of the lower similarity "foo" entries made it.
        for (ta, ca), (tb, cb) in pairs:
            self.assertNotIn(((ta, ca), (tb, cb + 'foo')), m_one_to_one)

        # Add one new entry with lower similarity.
        m_entry = deepcopy(matches)
        m_entry[(('table_1', 'BLA'), ('table_2', 'BLA'))] = 0.7214057

        m_entry_one_to_one = one_to_one_matches(m_entry)

        # Verify that all remaining values are above the median. The index is
        # derived from the number of distinct scores (not the number of
        # entries) so it stays in range when several entries share a score.
        # NOTE(review): assumes the implementation takes its median over the
        # set of distinct match values in the same way -- confirm.
        unique_scores = sorted(set(m_entry.values()), reverse=True)
        median = unique_scores[math.ceil(len(unique_scores) / 2)]
        for k in m_entry_one_to_one:
            self.assertGreaterEqual(m_entry_one_to_one[k], median)
2 changes: 1 addition & 1 deletion valentine/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def one_to_one_matches(matches: dict):
matched[key[0]] = False
matched[key[1]] = False

median = list(set_match_values)[math.ceil(len(set_match_values)/2)]
median = sorted(set_match_values, reverse=True)[math.ceil(len(set_match_values)/2)]

matches1to1 = dict()

Expand Down

0 comments on commit 06fdc82

Please sign in to comment.