add batch matching method addressing #62

delftdata · Oct 13, 2023 · 3dfe13e · 3dfe13e
1 parent afb20cb
commit 3dfe13e
Show file tree

Hide file tree

Showing 3 changed files with 71 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -5,6 +5,7 @@
 [![PyPI version](https://badge.fury.io/py/valentine.svg)](https://badge.fury.io/py/valentine)
 ![PyPI - Downloads](https://img.shields.io/pypi/dm/valentine)
 [![Python 3.8+](https://img.shields.io/badge/python-3.8|3.9|3.10|3.11-blue.svg)](https://www.python.org/downloads/release/python-380/)
+[![Codacy Badge](https://app.codacy.com/project/badge/Grade/85cfebfc9c6a43359c5b2e56a5fdf3a3)](https://app.codacy.com/gh/delftdata/valentine/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade)
 
 A python package for capturing potential relationships among columns of different tabular datasets, which are given in the form of pandas DataFrames. Valentine is based on [Valentine: Evaluating Matching Techniques for Dataset Discovery](https://ieeexplore.ieee.org/abstract/document/9458921)
 
@@ -70,15 +71,25 @@ In order to do so, the user can choose one of the following 5 matching methods:
      * **coeff_policy**(*str*) - Policy for deciding the weight coefficients of the propagation graph. Choice of "inverse\_product" or "inverse\_average" (default).
      * **formula**(*str*) - Formula on which iterative fixpoint computation is based. Choice of "basic", "formula\_a", "formula\_b" and "formula\_c" (default).
 
-### Matching DataFrames
+### Matching DataFrame Pair
 
-After selecting one of the 5 matching methods, the user can initiate the matching process in the following way:
+After selecting one of the 5 matching methods, the user can initiate the pairwise matching process in the following way:
 
 ```python
 matches = valentine_match(df1, df2, matcher, df1_name, df2_name)
 ```
 
-where df1 and df2 are the two pandas DataFrames for which we want to find matches and matcher is one of Coma, Cupid, DistributionBased, JaccardLevenMatcher or SimilarityFlooding. The user can also input a name for each DataFrame (defaults are "table\_1" and "table\_2"). Function ```valentine_match``` returns a dictionary storing as keys column pairs from the two DataFrames and as keys the corresponding similarity scores.
+where df1 and df2 are the two pandas DataFrames for which we want to find matches and matcher is one of Coma, Cupid, DistributionBased, JaccardLevenMatcher or SimilarityFlooding. The user can also input a name for each DataFrame (defaults are "table\_1" and "table\_2"). Function ```valentine_match``` returns a dictionary storing as keys column pairs from the two DataFrames and as values the corresponding similarity scores.
+
+### Matching DataFrame Batch
+
+After selecting one of the 5 matching methods, the user can initiate the batch matching process in the following way:
+
+```python
+matches = valentine_match_batch(df_iter_1, df_iter_2, matcher, df_iter_1_names, df_iter_2_names)
+```
+
+where df_iter_1 and df_iter_2 are two iterables of pandas DataFrames; each DataFrame from df_iter_1 is paired with the DataFrames from df_iter_2, and matcher is one of Coma, Cupid, DistributionBased, JaccardLevenMatcher or SimilarityFlooding. The user can also pass a list of names for the DataFrames of each iterable (defaults are "table\_1\_i" and "table\_2\_j", where i and j are the zero-based positions of the DataFrames in df_iter_1 and df_iter_2). Function ```valentine_match_batch``` returns a single dictionary storing as keys column pairs from the DataFrames and as values the corresponding similarity scores.
 
 ### Measuring effectiveness
 

diff --git a/tests/test_valentine.py b/tests/test_valentine.py
@@ -2,9 +2,9 @@
 
 from valentine.data_sources import DataframeTable
 
-from valentine import valentine_match, valentine_metrics, NotAValentineMatcher
+from valentine import valentine_match, valentine_match_batch, valentine_metrics, NotAValentineMatcher
 from tests import df1, df2
-from valentine.algorithms import Coma
+from valentine.algorithms import Coma, DistributionBased
 
 
 class TestValentine(unittest.TestCase):
@@ -28,3 +28,21 @@ def test_metrics(self):
                            ('EID', 'EID')]
         metrics = valentine_metrics.all_metrics(matches, golden_standard)
         assert metrics['recall_at_sizeof_ground_truth'] == 1.0
+
+    def test_batch_generator(self):
+        n = 3
+
+        def generate_df1():
+            for _ in range(n):
+                yield df1
+
+        def generate_df2():
+            for _ in range(n):
+                yield df2
+
+        matches = valentine_match_batch(generate_df1(), generate_df2(), DistributionBased())
+        assert len(matches) > 0
+
+    def test_batch_list(self):
+        matches = valentine_match_batch([df1, df1, df1], [df2, df2, df2], DistributionBased())
+        assert len(matches) > 0
diff --git a/valentine/__init__.py b/valentine/__init__.py
@@ -1,3 +1,5 @@
+from typing import Iterable, List, Union
+
 import pandas as pd
 
 import valentine.metrics as valentine_metrics
@@ -9,17 +11,45 @@ class NotAValentineMatcher(Exception):
     pass
 
 
+def validate_matcher(matcher):
+    if not isinstance(matcher, valentine.algorithms.BaseMatcher):
+        raise NotAValentineMatcher('The method that you selected is not supported by Valentine')
+
+
 def valentine_match(df1: pd.DataFrame,
                     df2: pd.DataFrame,
                     matcher: valentine.algorithms.BaseMatcher,
                     df1_name: str = 'table_1',
                     df2_name: str = 'table_2'):
+    """Match the columns of two DataFrames with the given matcher.
+
+    Both DataFrames are wrapped in ``valentine.data_sources.DataframeTable``
+    objects carrying ``df1_name`` / ``df2_name`` and handed to
+    ``matcher.get_matches``.
+
+    Returns:
+        The dictionary produced by ``matcher.get_matches``, re-ordered so that
+        entries with the highest value come first.
+
+    Raises:
+        NotAValentineMatcher: if ``matcher`` is not an instance of
+            ``valentine.algorithms.BaseMatcher``.
+    """
-    if isinstance(matcher, valentine.algorithms.BaseMatcher):
-        table_1 = valentine.data_sources.DataframeTable(df1, name=df1_name)
-        table_2 = valentine.data_sources.DataframeTable(df2, name=df2_name)
-        matches = dict(sorted(matcher.get_matches(table_1, table_2).items(),
-                              key=lambda item: item[1], reverse=True))
-    else:
-        raise NotAValentineMatcher('The method that you selected is not supported by Valentine')
+
+    validate_matcher(matcher)
+
+    table_1 = valentine.data_sources.DataframeTable(df1, name=df1_name)
+    table_2 = valentine.data_sources.DataframeTable(df2, name=df2_name)
+    matches = dict(sorted(matcher.get_matches(table_1, table_2).items(),
+                          key=lambda item: item[1], reverse=True))
+
+    return matches
+
+
+def valentine_match_batch(df_iter_1: Iterable[pd.DataFrame],
+                          df_iter_2: Iterable[pd.DataFrame],
+                          matcher: valentine.algorithms.BaseMatcher,
+                          df_iter_1_names: Union[List[str], None] = None,
+                          df_iter_2_names: Union[List[str], None] = None):
+
+    validate_matcher(matcher)
+
+    matches = {}
+
+    for df1_idx, df1 in enumerate(df_iter_1):
+        for df2_idx, df2 in enumerate(df_iter_2):
+            table_1_name = df_iter_1_names[df1_idx] if df_iter_1_names is not None else f'table_1_{df1_idx}'
+            table_2_name = df_iter_2_names[df2_idx] if df_iter_2_names is not None else f'table_2_{df2_idx}'
+            table_1 = valentine.data_sources.DataframeTable(df1, name=table_1_name)
+            table_2 = valentine.data_sources.DataframeTable(df2, name=table_2_name)
+            matches.update(matcher.get_matches(table_1, table_2))
+
+    matches = dict(sorted(matches.items(), key=lambda item: item[1], reverse=True))
 
     return matches