Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend Jaccard baseline #66

Merged
merged 14 commits into from
Oct 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@
__pycache__/
dist
valentine.egg-info
build
build
.vscode/
13 changes: 11 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,18 @@ In order to do so, the user can choose one of the following 5 matching methods:
* **threshold1**(*float*) - The threshold for phase 1 of the method, default is 0.15.
* **threshold2**(*float*) - The threshold for phase 2 of the method, default is 0.15.

4. `JaccardLevenMatcher(float: threshold_leven)` is a baseline method that uses Jaccard Similarity between columns to assess their correspondence score, enhanced by Levenshtein Distance
4. `JaccardDistanceMatcher(float: threshold_dist)` is a baseline method that uses Jaccard Similarity between columns to assess their correspondence score, optionally enhanced by a string similarity measure of choice.
* **Parameters**:
* **threshold_leven**(*float*) - Levenshtein ratio threshold for deciding whether two instances are same or not, default is 0.8.
* **threshold_dist**(*float*) - Acceptance threshold for assessing two strings as equal, default is 0.8.

* **distance_fun**(*StringDistanceFunction*) - String similarity function used to assess whether two strings are equal. The enumeration class type `StringDistanceFunction` can be imported from `valentine.algorithms.jaccard_distance`. Functions currently supported are:
* `StringDistanceFunction.Levenshtein`: [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
* `StringDistanceFunction.DamerauLevenshtein`: [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
* `StringDistanceFunction.Hamming`: [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance)
* `StringDistanceFunction.Jaro`: [Jaro distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
* `StringDistanceFunction.JaroWinkler`: [Jaro-Winkler distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
* `StringDistanceFunction.Exact`: String equality `==`


5. `SimilarityFlooding(str: coeff_policy, str: formula)` is the python implementation of the paper [Similarity Flooding: A Versatile Graph Matching Algorithm and its Application to Schema Matching](http://p8090-ilpubs.stanford.edu.tudelft.idm.oclc.org/730/1/2002-1.pdf)
* **Parameters**:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ nltk==3.8.1
anytree==2.9.0
networkx==3.1
chardet==5.2.0
levenshtein==0.22.0
jellyfish==1.0.1
PuLP==2.7.0
pyemd==1.0.0
# data loading
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
'anytree>=2.9,<3.0',
'networkx>=2.8,<4.0',
'chardet>=5.2.0,<6.0.0',
'levenshtein>=0.22,<1.0',
'jellyfish>=0.9,<1.1',
'PuLP>=2.5,<3.0',
'pyemd>=1.0.0,<2.0',
'python-dateutil>=2.8,<3.0',
Expand Down
59 changes: 51 additions & 8 deletions tests/test_algorithms.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import unittest

from tests import df1, df2
from valentine.algorithms import Coma, JaccardLevenMatcher, DistributionBased, SimilarityFlooding, Cupid
from valentine.algorithms import Coma, JaccardDistanceMatcher, DistributionBased, SimilarityFlooding, Cupid
from valentine.data_sources import DataframeTable
from valentine.algorithms.jaccard_distance import StringDistanceFunction

d1 = DataframeTable(df1, name='authors1')
d2 = DataframeTable(df2, name='authors2')
Expand Down Expand Up @@ -40,14 +41,56 @@ def test_distribution_based(self):
matches_db_matcher = distribution_based_matcher.get_matches(d1, d2)
assert len(matches_db_matcher) > 0 # Check that it actually produced output

def test_jaccard(self):
# Test the Jaccard matcher with exact string similarity
jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.Exact)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output

def test_jaccard_hamming(self):
# Test the Jaccard matcher with Hamming distance
jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.Hamming)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output
jd_matcher = JaccardDistanceMatcher(threshold_dist=0.5, process_num=2, distance_fun=StringDistanceFunction.Hamming)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output

def test_jaccard_levenshtein(self):
# Test the Jaccard Levenshtein matcher
jl_matcher = JaccardLevenMatcher()
matches_jl_matcher = jl_matcher.get_matches(d1, d2)
assert len(matches_jl_matcher) > 0 # Check that it actually produced output
jl_matcher = JaccardLevenMatcher(threshold_leven=0.5, process_num=2)
matches_jl_matcher = jl_matcher.get_matches(d1, d2)
assert len(matches_jl_matcher) > 0 # Check that it actually produced output
# Test the Jaccard matcher with Levenshtein distance
jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.Levenshtein)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output
jd_matcher = JaccardDistanceMatcher(threshold_dist=0.5, process_num=2, distance_fun=StringDistanceFunction.Levenshtein)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output

def test_jaccard_damerau_levenshtein(self):
# Test the Jaccard matcher with Damerau-Levenshtein distance
jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.DamerauLevenshtein)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output
jd_matcher = JaccardDistanceMatcher(threshold_dist=0.5, process_num=2, distance_fun=StringDistanceFunction.DamerauLevenshtein)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output

def test_jaccard_jaro_winkler(self):
# Test the Jaccard matcher with Jaro-Winkler distance
jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.JaroWinkler)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output
jd_matcher = JaccardDistanceMatcher(threshold_dist=0.5, process_num=2, distance_fun=StringDistanceFunction.JaroWinkler)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output

def test_jaccard_jaro(self):
# Test the Jaccard matcher with Jaro distance
jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.Jaro)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output
jd_matcher = JaccardDistanceMatcher(threshold_dist=0.5, process_num=2, distance_fun=StringDistanceFunction.Jaro)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output

def test_similarity_flooding(self):
# Test the Similarity flooding matcher
Expand Down
6 changes: 3 additions & 3 deletions valentine/algorithms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
from .coma.coma import Coma
from .cupid.cupid_model import Cupid
from .distribution_based.distribution_based import DistributionBased
from .jaccard_levenshtein.jaccard_leven import JaccardLevenMatcher
from .jaccard_distance.jaccard_distance import JaccardDistanceMatcher
from .similarity_flooding.similarity_flooding import SimilarityFlooding

schema_only_algorithms = [SimilarityFlooding.__name__, Cupid.__name__]
instance_only_algorithms = [DistributionBased.__name__, JaccardLevenMatcher.__name__]
instance_only_algorithms = [DistributionBased.__name__, JaccardDistanceMatcher.__name__]
schema_instance_algorithms = [Coma.__name__]
all_matchers = schema_only_algorithms + instance_only_algorithms + schema_instance_algorithms

Expand All @@ -18,7 +18,7 @@
"Coma",
"Cupid",
"DistributionBased",
"JaccardLevenMatcher",
"JaccardDistanceMatcher",
"SimilarityFlooding",
"BaseMatcher"
]
6 changes: 3 additions & 3 deletions valentine/algorithms/cupid/linguistic_matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
from anytree import LevelOrderIter
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from Levenshtein import ratio
from jellyfish import levenshtein_distance

from . import DATATYPE_COMPATIBILITY_TABLE
from .schema_element import SchemaElement, Token, TokenTypes

from ...utils.utils import normalize_distance

def snakecase_convert(name):
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
Expand Down Expand Up @@ -214,7 +214,7 @@ def compute_similarity_wordnet(word1,
# Higher the better
def compute_similarity_leven(word1,
word2):
return ratio(word1, word2)
return normalize_distance(levenshtein_distance(word1, word2), word1, word2)


# max is 0.5
Expand Down
9 changes: 9 additions & 0 deletions valentine/algorithms/jaccard_distance/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from enum import Enum, auto

class StringDistanceFunction(Enum):
    """Enumeration of the string similarity measures supported by the
    Jaccard distance matcher.

    Members are numbered explicitly in declaration order (matching the
    values ``auto()`` would assign).
    """
    Levenshtein = 1
    DamerauLevenshtein = 2
    Jaro = 3
    JaroWinkler = 4
    Hamming = 5
    Exact = 6
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,42 @@
from multiprocessing import get_context
from typing import Dict, Tuple

from Levenshtein import ratio
from jellyfish import levenshtein_distance, damerau_levenshtein_distance, \
jaro_similarity, jaro_winkler_similarity, hamming_distance

from ..jaccard_distance import StringDistanceFunction

from ..base_matcher import BaseMatcher
from ..match import Match
from ...data_sources.base_table import BaseTable
from ...utils.utils import normalize_distance


class JaccardLevenMatcher(BaseMatcher):
class JaccardDistanceMatcher(BaseMatcher):
"""
Class containing the methods for implementing a simple baseline matcher that uses Jaccard Similarity between
columns to assess their correspondence score, enhanced by Levenshtein Distance.
columns to assess their correspondence score, enhanced by a string distance measure.

Methods
-------
jaccard_leven(list1, list2, threshold, process_pool)
jaccard_distance(list1, list2, threshold, process_pool)

"""

def __init__(self,
threshold_leven: float = 0.8,
threshold_dist: float = 0.8,
distance_fun: StringDistanceFunction = StringDistanceFunction.Levenshtein,
process_num: int = 1):
"""
Parameters
----------
threshold_leven : float, optional
The Levenshtein ratio between the two column entries (lower ratio, the entries are more different)
threshold_dist : float, optional
The acceptance threshold for two strings to be considered as equal
process_num : int, optional
The number of processes to spawn
"""
self.__threshold_leven = float(threshold_leven)
self.__threshold_dist = float(threshold_dist)
self.__process_num = int(process_num)
self.__distance_function = distance_fun

def get_matches(self,
source_input: BaseTable,
Expand All @@ -43,28 +48,30 @@ def get_matches(self,
if self.__process_num == 1:
for combination in self.__get_column_combinations(source_input,
target_input,
self.__threshold_leven,
self.__threshold_dist,
target_id,
source_id):
matches.update(self.process_jaccard_leven(combination))
source_id,
self.__distance_function):
matches.update(self.process_jaccard_distance(combination))
else:
with get_context("spawn").Pool(self.__process_num) as process_pool:
matches = {}
list_of_matches = process_pool.map(self.process_jaccard_leven,
list_of_matches = process_pool.map(self.process_jaccard_distance,
self.__get_column_combinations(source_input,
target_input,
self.__threshold_leven,
self.__threshold_dist,
target_id,
source_id))
source_id,
self.__distance_function))
[matches.update(match) for match in list_of_matches]
matches = {k: v for k, v in matches.items() if v > 0.0} # Remove the pairs with zero similarity
return matches

def process_jaccard_leven(self, tup: tuple):
def process_jaccard_distance(self, tup: tuple):

source_data, target_data, threshold, target_id, target_table_name, target_table_unique_identifier, \
target_column_name, target_column_unique_identifier, source_table_name, source_table_unique_identifier, \
source_id, source_column_name, source_column_unique_identifier = tup
source_data, target_data, threshold, _, target_table_name, _, \
target_column_name, _, source_table_name, _, \
_, source_column_name, _, distance_function = tup

if len(set(source_data)) < len(set(target_data)):
set1 = set(source_data)
Expand All @@ -73,11 +80,22 @@ def process_jaccard_leven(self, tup: tuple):
set1 = set(target_data)
set2 = set(source_data)

if distance_function == StringDistanceFunction.Exact:
threshold = 1.0
combinations = self.__get_set_combinations(set1, set2, threshold)

intersection_cnt = 0
for cmb in combinations:
intersection_cnt = intersection_cnt + self.__process_lv(cmb)
if distance_function in [StringDistanceFunction.Levenshtein, StringDistanceFunction.Exact]:
intersection_cnt = intersection_cnt + self.__process_distance(cmb + (levenshtein_distance, True))
elif distance_function == StringDistanceFunction.DamerauLevenshtein:
intersection_cnt = intersection_cnt + self.__process_distance(cmb + (damerau_levenshtein_distance, True))
elif distance_function == StringDistanceFunction.Hamming:
intersection_cnt = intersection_cnt + self.__process_distance(cmb + (hamming_distance, True))
elif distance_function == StringDistanceFunction.Jaro:
intersection_cnt = intersection_cnt + self.__process_distance(cmb + (jaro_similarity, False))
elif distance_function == StringDistanceFunction.JaroWinkler:
intersection_cnt = intersection_cnt + self.__process_distance(cmb + (jaro_winkler_similarity, False))

union_cnt = len(set1) + len(set2) - intersection_cnt

Expand All @@ -89,19 +107,20 @@ def process_jaccard_leven(self, tup: tuple):
return Match(target_table_name, target_column_name,
source_table_name, source_column_name,
sim).to_dict

@staticmethod
def __get_column_combinations(source_table: BaseTable,
target_table: BaseTable,
threshold,
target_id,
source_id):
source_id,
distance_function: StringDistanceFunction):
for source_column, target_column in product(source_table.get_columns(), target_table.get_columns()):
yield source_column.data, target_column.data, threshold, target_id, \
target_table.name, target_table.unique_identifier, \
target_column.name, target_column.unique_identifier, \
source_table.name, source_table.unique_identifier, source_id, \
source_column.name, source_column.unique_identifier
source_column.name, source_column.unique_identifier, distance_function

@staticmethod
def __get_set_combinations(set1: set,
Expand All @@ -126,9 +145,9 @@ def __get_set_combinations(set1: set,
"""
for s1 in set1:
yield str(s1), set2, threshold

@staticmethod
def __process_lv(tup: tuple):
def __process_distance(tup: tuple):
"""
Function that checks whether there exists an entry in the second set whose
similarity with the element from the first set meets or exceeds the given threshold
Expand All @@ -143,8 +162,15 @@ def __process_lv(tup: tuple):
int
1 if there is such an element 0 if not
"""
s1, set2, threshold = tup
s1, set2, threshold, distance_function, normalize = tup

for s2 in set2:
if ratio(s1, str(s2)) >= threshold:
return 1
str_s2 = str(s2)
dist = distance_function(s1, str_s2)
if normalize:
if normalize_distance(dist, s1, str_s2) >= threshold:
return 1
else:
if dist >= threshold:
return 1
return 0
Empty file.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Dict, Tuple

from Levenshtein import ratio
from jellyfish import levenshtein_distance
import math

from .graph import Graph
Expand All @@ -9,6 +9,7 @@
from ..match import Match
from ..base_matcher import BaseMatcher
from ...data_sources.base_table import BaseTable
from ...utils.utils import normalize_distance


class SimilarityFlooding(BaseMatcher):
Expand Down Expand Up @@ -42,7 +43,7 @@ def __calculate_initial_mapping(self):
if n1.name[0:6] == "NodeID" or n2.name[0:6] == "NodeID":
self.__initial_map[NodePair(n1, n2)] = 0.0
else:
similarity = ratio(n1.name, n2.name)
similarity = normalize_distance(levenshtein_distance(n1.name, n2.name), n1.name, n2.name)
self.__initial_map[NodePair(n1, n2)] = similarity

@staticmethod
Expand Down
17 changes: 17 additions & 0 deletions valentine/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,23 @@ def convert_data_type(string: str):
return f
except ValueError:
return string


def normalize_distance(dist: int,
                       str1: str,
                       str2: str) -> float:
    """
    Return a normalized similarity score between two strings given their distance.

    The raw edit distance is divided by the length of the longer string and
    subtracted from 1, so identical strings score 1.0 and completely
    different strings of equal length score 0.0.

    Parameters
    ----------
    dist : int
        The distance between the two strings (hamming, levenshtein or damerau levenshtein)
    str1 : str
        The first compared string
    str2 : str
        The second compared string

    Returns
    -------
    float
        Similarity in [0, 1]; the 1 in max() guards against division by
        zero when both strings are empty.
    """
    return 1 - dist / max(len(str1), len(str2), 1)


def get_project_root():
Expand Down
Loading