delftdata · ThanosTsiamis · May 10, 2024
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -13,26 +13,26 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v2
+      - uses: actions/checkout@v2
 
-    - name: Install Valentine
-      run: pip install .
+      - name: Install Valentine
+        run: pip install .
 
-    - name: Install Coverage
-      run: pip install coverage
+      - name: Install Coverage
+        run: pip install coverage
 
-    - name: Install Pytest
-      run: pip install pytest==8.2.0
+      - name: Install Pytest
+        run: pip install pytest==8.2.0
 
-    - name: Run Tests
-      run: coverage run --source=valentine -m pytest tests/
+      - name: Run Tests
+        run: coverage run --source=valentine -m pytest tests/
 
-    - name: Generate Coverage Report
-      run: coverage xml
+      - name: Generate Coverage Report
+        run: coverage xml
 
-    - name: "Upload coverage to Codecov"
-      uses: codecov/codecov-action@v4
-      with:
-        fail_ci_if_error: true
-        token: ${{ secrets.CODECOV_TOKEN }}
-        verbose: true
+      - name: "Upload coverage to Codecov"
+        uses: codecov/codecov-action@v4
+        with:
+          fail_ci_if_error: true
+          token: ${{ secrets.CODECOV_TOKEN }}
+          verbose: true
diff --git a/.github/workflows/build_all_os.yml b/.github/workflows/build_all_os.yml
@@ -11,8 +11,8 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
-        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+        os: [ ubuntu-latest, macos-latest, windows-latest ]
+        python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12' ]
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python

diff --git a/README.md b/README.md
@@ -7,66 +7,81 @@
 [![Python 3.8+](https://img.shields.io/badge/python-3.8|3.9|3.10|3.11|3.12-blue.svg)](https://www.python.org/downloads/release/python-380/)
 [![Codacy Badge](https://app.codacy.com/project/badge/Grade/85cfebfc9c6a43359c5b2e56a5fdf3a3)](https://app.codacy.com/gh/delftdata/valentine/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade)
 
-A python package for capturing potential relationships among columns of different tabular datasets, which are given in the form of pandas DataFrames. Valentine is based on [Valentine: Evaluating Matching Techniques for Dataset Discovery](https://ieeexplore.ieee.org/abstract/document/9458921)
+A python package for capturing potential relationships among columns of different tabular datasets, which are given in
+the form of pandas DataFrames. Valentine is based
+on [Valentine: Evaluating Matching Techniques for Dataset Discovery](https://ieeexplore.ieee.org/abstract/document/9458921)
 
 You can find more information about the research supporting Valentine [here](https://delftdata.github.io/valentine/).
 
 ## Experimental suite version
 
-The original experimental suite version of Valentine, as first published for the needs of the research paper, can be still found [here](https://github.com/delftdata/valentine/tree/v1.1).
+The original experimental suite version of Valentine, as first published for the needs of the research paper, can be
+still found [here](https://github.com/delftdata/valentine/tree/v1.1).
 
 ## Installation instructions
+
 ### Requirements
 
-*   *Python* >=3.8,<3.13
-*   *Java*: For the Coma matcher it is required to have java (jre) installed
+* *Python* >=3.8,<3.13
+* *Java*: For the Coma matcher it is required to have java (jre) installed
 
 To install Valentine simply run:
 
 ```shell
 pip install valentine
 ```
 
-
 ## Usage
-Valentine can be used to find matches among columns of a given pair of pandas DataFrames. 
+
+Valentine can be used to find matches among columns of a given pair of pandas DataFrames.
 
 ### Matching methods
+
 In order to do so, the user can choose one of the following 5 matching methods:
 
-1.   `Coma(int: max_n, bool: use_instances, str: java_xmx)` is a python wrapper around [COMA 3.0 Comunity edition](https://sourceforge.net/projects/coma-ce/)
-     *    **Parameters**: 
-           *    **max_n**(*int*) - Accept similarity threshold, (default: 0).
-           *    **use_instances**(*bool*) - Wheather Coma will make use of the data instances or just the schema information, (default: False).
-           *    **java_xmx**(*str*) - The amount of RAM that Coma is allowed to use, (default: "1024m") .
-
-2.   `Cupid(float: w_struct, float: leaf_w_struct, float: th_accept)` is the python implementation of the paper [Generic Schema Matching with Cupid](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.79.4079&rep=rep1&type=pdf)
-     *    **Parameters**:
-          *    **w_struct**(*float*) - Structural similarity threshold, default is 0.2.
-          *    **leaf_w_struct**(*float*) - Structural similarity threshold, leaf level, default is 0.2.
-          *    **th_accept**(*float*) - Accept similarity threshold, default is 0.7.
-
-3.   `DistributionBased(float: threshold1, float: threshold2)` is the python implementation of the paper [Automatic Discovery of Attributes in Relational Databases](https://dl-acm-org.tudelft.idm.oclc.org/doi/pdf/10.1145/1989323.1989336)
-     *    **Parameters**: 
-          *    **threshold1**(*float*) - The threshold for phase 1 of the method, default is 0.15.
-          *    **threshold2**(*float*) - The threshold for phase 2 of the method, default is 0.15.
-
-4.   `JaccardDistanceMatcher(float: threshold_dist)` is a baseline method that uses Jaccard Similarity between columns to assess their correspondence score, optionally enhanced by a string similarity measure of choice.
-     *    **Parameters**: 
-          *    **threshold_dist**(*float*) - Acceptance threshold for assessing two strings as equal, default is 0.8.
-
-          *    **distance_fun**(*StringDistanceFunction*) - String similarity function used to assess whether two strings are equal. The enumeration class type `StringDistanceFunction` can be imported from `valentine.algorithms.jaccard_distance`. Functions currently supported are:  
-   		       * `StringDistanceFunction.Levenshtein`: [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
-               * `StringDistanceFunction.DamerauLevenshtein`: [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
-               * `StringDistanceFunction.Hamming`: [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance)
-               * `StringDistanceFunction.Jaro`: [Jaro distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
-               * `StringDistanceFunction.JaroWinkler`: [Jaro-Winkler distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
+1. `Coma(int: max_n, bool: use_instances, str: java_xmx)` is a python wrapper
+   around [COMA 3.0 Community edition](https://sourceforge.net/projects/coma-ce/)
+    * **Parameters**:
+        * **max_n**(*int*) - Accept similarity threshold, (default: 0).
+        * **use_instances**(*bool*) - Whether Coma will make use of the data instances or just the schema
+          information, (default: False).
+        * **java_xmx**(*str*) - The amount of RAM that Coma is allowed to use, (default: "1024m").
+
+2. `Cupid(float: w_struct, float: leaf_w_struct, float: th_accept)` is the python implementation of the
+   paper [Generic Schema Matching with Cupid](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.79.4079&rep=rep1&type=pdf)
+    * **Parameters**:
+        * **w_struct**(*float*) - Structural similarity threshold, default is 0.2.
+        * **leaf_w_struct**(*float*) - Structural similarity threshold, leaf level, default is 0.2.
+        * **th_accept**(*float*) - Accept similarity threshold, default is 0.7.
+
+3. `DistributionBased(float: threshold1, float: threshold2)` is the python implementation of the
+   paper [Automatic Discovery of Attributes in Relational Databases](https://dl-acm-org.tudelft.idm.oclc.org/doi/pdf/10.1145/1989323.1989336)
+    * **Parameters**:
+        * **threshold1**(*float*) - The threshold for phase 1 of the method, default is 0.15.
+        * **threshold2**(*float*) - The threshold for phase 2 of the method, default is 0.15.
+
+4. `JaccardDistanceMatcher(float: threshold_dist)` is a baseline method that uses Jaccard Similarity between columns to
+   assess their correspondence score, optionally enhanced by a string similarity measure of choice.
+    * **Parameters**:
+        * **threshold_dist**(*float*) - Acceptance threshold for assessing two strings as equal, default is 0.8.
+
+        * **distance_fun**(*StringDistanceFunction*) - String similarity function used to assess whether two strings are
+          equal. The enumeration class type `StringDistanceFunction` can be imported
+          from `valentine.algorithms.jaccard_distance`. Functions currently supported are:
+            * `StringDistanceFunction.Levenshtein`: [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
+            * `StringDistanceFunction.DamerauLevenshtein`: [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
+            * `StringDistanceFunction.Hamming`: [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance)
+            * `StringDistanceFunction.Jaro`: [Jaro distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
+            * `StringDistanceFunction.JaroWinkler`: [Jaro-Winkler distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
               * `StringDistanceFunction.Exact`: String equality `==`
 
-5.   `SimilarityFlooding(str: coeff_policy, str: formula)` is the python implementation of the paper [Similarity Flooding: A Versatile Graph Matching Algorithmand its Application to Schema Matching](http://p8090-ilpubs.stanford.edu.tudelft.idm.oclc.org/730/1/2002-1.pdf)
-     * **Parameters**: 
-        *    **coeff_policy**(*str*) - Policy for deciding the weight coefficients of the propagation graph. Choice of "inverse\_product" or "inverse\_average" (default).
-        *    **formula**(*str*) - Formula on which iterative fixpoint computation is based. Choice of "basic", "formula\_a", "formula\_b" and "formula\_c" (default).
+5. `SimilarityFlooding(str: coeff_policy, str: formula)` is the python implementation of the
+   paper [Similarity Flooding: A Versatile Graph Matching Algorithm and its Application to Schema Matching](http://p8090-ilpubs.stanford.edu.tudelft.idm.oclc.org/730/1/2002-1.pdf)
+    * **Parameters**:
+        * **coeff_policy**(*str*) - Policy for deciding the weight coefficients of the propagation graph. Choice of
+          "inverse\_product" or "inverse\_average" (default).
+        * **formula**(*str*) - Formula on which iterative fixpoint computation is based. Choice of "basic",
+          "formula\_a", "formula\_b" and "formula\_c" (default).
 
 ### Matching DataFrame Pair
 
@@ -76,7 +91,11 @@ After selecting one of the 5 matching methods, the user can initiate the pairwis
 matches = valentine_match(df1, df2, matcher, df1_name, df2_name)
 ```
 
-where df1 and df2 are the two pandas DataFrames for which we want to find matches and matcher is one of Coma, Cupid, DistributionBased, JaccardLevenMatcher or SimilarityFlooding. The user can also input a name for each DataFrame (defaults are "table\_1" and "table\_2"). Function ```valentine_match``` returns a MatcherResults object, which is a dictionary with additional convenience methods, such as `one_to_one`, `take_top_percent`, `get_metrics` and more. It stores as keys column pairs from the two DataFrames and as values the corresponding similarity scores.
+where df1 and df2 are the two pandas DataFrames for which we want to find matches and matcher is one of Coma, Cupid,
+DistributionBased, JaccardDistanceMatcher or SimilarityFlooding. The user can also input a name for each DataFrame
+(defaults are "table\_1" and "table\_2"). Function ```valentine_match``` returns a MatcherResults object, which is a
+dictionary with additional convenience methods, such as `one_to_one`, `take_top_percent`, `get_metrics` and more. It
+stores as keys column pairs from the two DataFrames and as values the corresponding similarity scores.
 
 ### Matching DataFrame Batch
 
@@ -86,11 +105,19 @@ After selecting one of the 5 matching methods, the user can initiate the batch m
 matches = valentine_match_batch(df_iter_1, df_iter_2, matcher, df_iter_1_names, df_iter_2_names)
 ```
 
-where df_iter_1 and df_iter_2 are the two iterable structures containing pandas DataFrames for which we want to find matches and matcher is one of Coma, Cupid, DistributionBased, JaccardLevenMatcher or SimilarityFlooding. The user can also input an iterable with names for each DataFrame. Function ```valentine_match_batch``` returns a MatcherResults object, which is a dictionary with additional convenience methods, such as `one_to_one`, `take_top_percent`, `get_metrics` and more. It stores as keys column pairs from the two DataFrames and as values the corresponding similarity scores.
-
+where df_iter_1 and df_iter_2 are the two iterable structures containing pandas DataFrames for which we want to find
+matches and matcher is one of Coma, Cupid, DistributionBased, JaccardDistanceMatcher or SimilarityFlooding. The user can
+also input an iterable with names for each DataFrame. Function ```valentine_match_batch``` returns a MatcherResults
+object, which is a dictionary with additional convenience methods, such
+as `one_to_one`, `take_top_percent`, `get_metrics` and more. It stores as keys column pairs from the two DataFrames and
+as values the corresponding similarity scores.
 
 ### MatcherResults instance
-The `MatcherResults` instance has some convenience methods that the user can use to either obtain a subset of the data or to transform the data. This instance is a dictionary and is sorted upon instantiation, from high similarity to low similarity.
+
+The `MatcherResults` instance has some convenience methods that the user can use to either obtain a subset of the data
+or to transform the data. This instance is a dictionary and is sorted upon instantiation, from high similarity to low
+similarity.
+
 ```python
 top_n_matches = matches.take_top_n(5)
 
@@ -99,28 +126,38 @@ top_n_percent_matches = matches.take_top_percent(25)
 one_to_one_matches = matches.one_to_one()
 ```
 
-
 ### Measuring effectiveness
-The MatcherResults instance that is returned by `valentine_match` or `valentine_match_batch` also has a `get_metrics` method that the user can use 
+
+The MatcherResults instance that is returned by `valentine_match` or `valentine_match_batch` also has a `get_metrics`
+method that the user can use
 
 ```python 
 metrics = matches.get_metrics(ground_truth)
 ``` 
 
-in order to get all effectiveness metrics, such as Precision, Recall, F1-score and others as described in the original Valentine paper. In order to do so, the user needs to also input the ground truth of matches based on which the metrics will be calculated. The ground truth can be given as a list of tuples representing column matches that should hold (see example below).
+in order to get all effectiveness metrics, such as Precision, Recall, F1-score and others as described in the original
+Valentine paper. In order to do so, the user needs to also input the ground truth of matches based on which the metrics
+will be calculated. The ground truth can be given as a list of tuples representing column matches that should hold (see
+example below).
 
-By default, all the core metrics will be used for this with default parameters, but the user can also customize which metrics to run with what parameters, and implement own custom metrics by extending from the `Metric` base class. Some sets of metrics are available as well.
+By default, all the core metrics will be used for this with default parameters, but the user can also customize which
+metrics to run with what parameters, and implement their own custom metrics by extending the `Metric` base class. Some
+sets of metrics are available as well.
 
 ```python
 from valentine.metrics import F1Score, PrecisionTopNPercent, METRICS_PRECISION_INCREASING_N
+
 metrics_custom = matches.get_metrics(ground_truth, metrics={F1Score(one_to_one=False), PrecisionTopNPercent(n=70)})
 metrics_prefefined_set = matches.get_metrics(ground_truth, metrics=METRICS_PRECISION_INCREASING_N)
 
 ```
 
-
 ### Example
-The following block of code shows: 1) how to run a matcher from Valentine on two DataFrames storing information about authors and their publications, and then 2) how to assess its effectiveness based on a given ground truth (a more extensive example is shown in [`valentine_example.py`](https://github.com/delftdata/valentine/blob/master/examples/valentine_example.py)):
+
+The following block of code shows: 1) how to run a matcher from Valentine on two DataFrames storing information about
+authors and their publications, and then 2) how to assess its effectiveness based on a given ground truth (a more
+extensive example is shown
+in [`valentine_example.py`](https://github.com/delftdata/valentine/blob/master/examples/valentine_example.py)):
 
 ```python
 import os
@@ -146,7 +183,7 @@ ground_truth = [('Cited by', 'Cited by'),
                 ('EID', 'EID')]
 
 metrics = matches.get_metrics(ground_truth)
-    
+
 print(metrics)
 ```
 
@@ -168,6 +205,7 @@ The output of the above code block is:
 ```
 
 ## Cite Valentine
+
 ```
 Original Valentine paper:
 @inproceedings{koutras2021valentine,

diff --git a/examples/valentine_example.py b/examples/valentine_example.py
@@ -1,9 +1,12 @@
 import os
+import pprint
+
 import pandas as pd
-from valentine.metrics import F1Score, PrecisionTopNPercent
+
 from valentine import valentine_match
 from valentine.algorithms import JaccardDistanceMatcher
-import pprint
+from valentine.metrics import F1Score, PrecisionTopNPercent
+
 pp = pprint.PrettyPrinter(indent=4, sort_dicts=False)
 
 

diff --git a/setup.py b/setup.py
@@ -1,6 +1,7 @@
-import setuptools
 from pathlib import Path
 
+import setuptools
+
 this_directory = Path(__file__).parent
 long_description = (this_directory / "README.md").read_text()
 

diff --git a/tests/__init__.py b/tests/__init__.py
@@ -1,4 +1,5 @@
 import os
+
 import pandas as pd
 
 # Load the data for the tests

diff --git a/tests/test_matcher_results.py b/tests/test_matcher_results.py
@@ -1,11 +1,11 @@
-import unittest
 import math
+import unittest
 
 from tests import df1, df2
-from valentine.algorithms.matcher_results import MatcherResults
+from valentine import valentine_match
 from valentine.algorithms import JaccardDistanceMatcher
+from valentine.algorithms.matcher_results import MatcherResults
 from valentine.metrics import Precision
-from valentine import valentine_match
 
 
 class TestMatcherResults(unittest.TestCase):
@@ -55,7 +55,7 @@ def test_one_to_one(self):
         assert m_one_to_one != m_entry_one_to_one
 
         # Verify that all remaining values are above the median
-        median = sorted(list(m_entry.values()), reverse=True)[math.ceil(len(m_entry)/2)]
+        median = sorted(list(m_entry.values()), reverse=True)[math.ceil(len(m_entry) / 2)]
         for k in m_entry_one_to_one:
             assert m_entry_one_to_one[k] >= median
 
@@ -79,8 +79,8 @@ def test_take_top_n(self):
         take_all = self.matches.take_top_n(len(self.matches))
         assert len(take_all) == len(self.matches)
 
-        take_more_than_all = self.matches.take_top_n(len(self.matches)+1)
+        take_more_than_all = self.matches.take_top_n(len(self.matches) + 1)
         assert len(take_more_than_all) == len(self.matches)
 
     def test_copy(self):
-        assert self.matches.get_copy() is not self.matches
+        assert self.matches.get_copy() is not self.matches