Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend Jaccard baseline #66

Merged
merged 14 commits into from
Oct 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@
__pycache__/
dist
valentine.egg-info
build
build
.vscode/
13 changes: 11 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,18 @@ In order to do so, the user can choose one of the following 5 matching methods:
* **threshold1**(*float*) - The threshold for phase 1 of the method, default is 0.15.
* **threshold2**(*float*) - The threshold for phase 2 of the method, default is 0.15.

4. `JaccardLevenMatcher(float: threshold_leven)` is a baseline method that uses Jaccard Similarity between columns to assess their correspondence score, enhanced by Levenshtein Distance
4. `JaccardDistanceMatcher(float: threshold_dist)` is a baseline method that uses Jaccard Similarity between columns to assess their correspondence score, optionally enhanced by a string similarity measure of choice.
* **Parameters**:
* **threshold_leven**(*float*) - Levenshtein ratio threshold for deciding whether two instances are same or not, default is 0.8.
* **threshold_dist**(*float*) - Acceptance threshold for assessing two strings as equal, default is 0.8.

* **distance_fun**(*StringDistanceFunction*) - String similarity function used to assess whether two strings are equal. The enumeration class type `StringDistanceFunction` can be imported from `valentine.algorithms.jaccard_distance`. Functions currently supported are:
* `StringDistanceFunction.Levenshtein`: [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
* `StringDistanceFunction.DamerauLevenshtein`: [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
* `StringDistanceFunction.Hamming`: [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance)
* `StringDistanceFunction.Jaro`: [Jaro distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
* `StringDistanceFunction.JaroWinkler`: [Jaro-Winkler distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
* `StringDistanceFunction.Exact`: String equality `==`


5. `SimilarityFlooding(str: coeff_policy, str: formula)` is the python implementation of the paper [Similarity Flooding: A Versatile Graph Matching Algorithm and its Application to Schema Matching](http://p8090-ilpubs.stanford.edu.tudelft.idm.oclc.org/730/1/2002-1.pdf)
* **Parameters**:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ nltk==3.8.1
anytree==2.9.0
networkx==3.1
chardet==5.2.0
levenshtein==0.22.0
jellyfish==1.0.1
PuLP==2.7.0
pyemd==1.0.0
# data loading
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
'anytree>=2.9,<3.0',
'networkx>=2.8,<4.0',
'chardet>=5.2.0,<6.0.0',
'levenshtein>=0.22,<1.0',
'jellyfish>=0.9,<1.1',
'PuLP>=2.5,<3.0',
'pyemd>=1.0.0,<2.0',
'python-dateutil>=2.8,<3.0',
Expand Down
59 changes: 51 additions & 8 deletions tests/test_algorithms.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import unittest

from tests import df1, df2
from valentine.algorithms import Coma, JaccardLevenMatcher, DistributionBased, SimilarityFlooding, Cupid
from valentine.algorithms import Coma, JaccardDistanceMatcher, DistributionBased, SimilarityFlooding, Cupid
from valentine.data_sources import DataframeTable
from valentine.algorithms.jaccard_distance import StringDistanceFunction

d1 = DataframeTable(df1, name='authors1')
d2 = DataframeTable(df2, name='authors2')
Expand Down Expand Up @@ -40,14 +41,56 @@ def test_distribution_based(self):
matches_db_matcher = distribution_based_matcher.get_matches(d1, d2)
assert len(matches_db_matcher) > 0 # Check that it actually produced output

def test_jaccard(self):
# Test the Jaccard matcher with exact string similarity
jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.Exact)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output

def test_jaccard_hamming(self):
# Test the Jaccard matcher with Hamming distance
jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.Hamming)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output
jd_matcher = JaccardDistanceMatcher(threshold_dist=0.5, process_num=2, distance_fun=StringDistanceFunction.Hamming)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output

def test_jaccard_levenshtein(self):
# Test the Jaccard Levenshtein matcher
jl_matcher = JaccardLevenMatcher()
matches_jl_matcher = jl_matcher.get_matches(d1, d2)
assert len(matches_jl_matcher) > 0 # Check that it actually produced output
jl_matcher = JaccardLevenMatcher(threshold_leven=0.5, process_num=2)
matches_jl_matcher = jl_matcher.get_matches(d1, d2)
assert len(matches_jl_matcher) > 0 # Check that it actually produced output
# Test the Jaccard matcher with Levenshtein distance
jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.Levenshtein)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output
jd_matcher = JaccardDistanceMatcher(threshold_dist=0.5, process_num=2, distance_fun=StringDistanceFunction.Levenshtein)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output

def test_jaccard_damerau_levenshtein(self):
# Test the Jaccard matcher with Damerau-Levenshtein distance
jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.DamerauLevenshtein)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output
jd_matcher = JaccardDistanceMatcher(threshold_dist=0.5, process_num=2, distance_fun=StringDistanceFunction.DamerauLevenshtein)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output

def test_jaccard_jaro_winkler(self):
# Test the Jaccard matcher with Jaro-Winkler distance
jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.JaroWinkler)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output
jd_matcher = JaccardDistanceMatcher(threshold_dist=0.5, process_num=2, distance_fun=StringDistanceFunction.JaroWinkler)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output

def test_jaccard_jaro(self):
# Test the Jaccard matcher with Jaro distance
jd_matcher = JaccardDistanceMatcher(distance_fun=StringDistanceFunction.Jaro)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output
jd_matcher = JaccardDistanceMatcher(threshold_dist=0.5, process_num=2, distance_fun=StringDistanceFunction.Jaro)
matches_jd_matcher = jd_matcher.get_matches(d1, d2)
assert len(matches_jd_matcher) > 0 # Check that it actually produced output

def test_similarity_flooding(self):
# Test the Similarity flooding matcher
Expand Down
6 changes: 3 additions & 3 deletions valentine/algorithms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
from .coma.coma import Coma
from .cupid.cupid_model import Cupid
from .distribution_based.distribution_based import DistributionBased
from .jaccard_levenshtein.jaccard_leven import JaccardLevenMatcher
from .jaccard_distance.jaccard_distance import JaccardDistanceMatcher
from .similarity_flooding.similarity_flooding import SimilarityFlooding

schema_only_algorithms = [SimilarityFlooding.__name__, Cupid.__name__]
instance_only_algorithms = [DistributionBased.__name__, JaccardLevenMatcher.__name__]
instance_only_algorithms = [DistributionBased.__name__, JaccardDistanceMatcher.__name__]
schema_instance_algorithms = [Coma.__name__]
all_matchers = schema_only_algorithms + instance_only_algorithms + schema_instance_algorithms

Expand All @@ -18,7 +18,7 @@
"Coma",
"Cupid",
"DistributionBased",
"JaccardLevenMatcher",
"JaccardDistanceMatcher",
"SimilarityFlooding",
"BaseMatcher"
]
6 changes: 3 additions & 3 deletions valentine/algorithms/cupid/linguistic_matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
from anytree import LevelOrderIter
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from Levenshtein import ratio
from jellyfish import levenshtein_distance

from . import DATATYPE_COMPATIBILITY_TABLE
from .schema_element import SchemaElement, Token, TokenTypes

from ...utils.utils import normalize_distance

def snakecase_convert(name):
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
Expand Down Expand Up @@ -214,7 +214,7 @@ def compute_similarity_wordnet(word1,
# Higher the better
def compute_similarity_leven(word1,
word2):
return ratio(word1, word2)
return normalize_distance(levenshtein_distance(word1, word2), word1, word2)


# max is 0.5
Expand Down
9 changes: 9 additions & 0 deletions valentine/algorithms/jaccard_distance/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from enum import Enum, auto

class StringDistanceFunction(Enum):
    """Enumeration of the string similarity measures supported by the
    Jaccard distance matcher.

    Members are numbered explicitly in declaration order (matching the
    values ``auto()`` would assign).
    """
    Levenshtein = 1
    DamerauLevenshtein = 2
    Jaro = 3
    JaroWinkler = 4
    Hamming = 5
    Exact = 6
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,42 @@
from multiprocessing import get_context
from typing import Dict, Tuple

from Levenshtein import ratio
from jellyfish import levenshtein_distance, damerau_levenshtein_distance, \
jaro_similarity, jaro_winkler_similarity, hamming_distance

from ..jaccard_distance import StringDistanceFunction

from ..base_matcher import BaseMatcher
from ..match import Match
from ...data_sources.base_table import BaseTable
from ...utils.utils import normalize_distance


class JaccardLevenMatcher(BaseMatcher):
class JaccardDistanceMatcher(BaseMatcher):
"""
Class containing the methods for implementing a simple baseline matcher that uses Jaccard Similarity between
columns to assess their correspondence score, enhanced by Levenshtein Distance.
columns to assess their correspondence score, enhanced by a string distance measure.

Methods
-------
jaccard_leven(list1, list2, threshold, process_pool)
jaccard_distance(list1, list2, threshold, process_pool)

"""

def __init__(self,
threshold_leven: float = 0.8,
threshold_dist: float = 0.8,
distance_fun: StringDistanceFunction = StringDistanceFunction.Levenshtein,
process_num: int = 1):
"""
Parameters
----------
threshold_leven : float, optional
The Levenshtein ratio between the two column entries (lower ratio, the entries are more different)
threshold_dist : float, optional
The acceptance threshold for two strings to be considered as equal
process_num : int, optional
The number of processes to spawn
"""
self.__threshold_leven = float(threshold_leven)
self.__threshold_dist = float(threshold_dist)
self.__process_num = int(process_num)
self.__distance_function = distance_fun

def get_matches(self,
source_input: BaseTable,
Expand All @@ -43,28 +48,30 @@ def get_matches(self,
if self.__process_num == 1:
for combination in self.__get_column_combinations(source_input,
target_input,
self.__threshold_leven,
self.__threshold_dist,
target_id,
source_id):
matches.update(self.process_jaccard_leven(combination))
source_id,
self.__distance_function):
matches.update(self.process_jaccard_distance(combination))
else:
with get_context("spawn").Pool(self.__process_num) as process_pool:
matches = {}
list_of_matches = process_pool.map(self.process_jaccard_leven,
list_of_matches = process_pool.map(self.process_jaccard_distance,
self.__get_column_combinations(source_input,
target_input,
self.__threshold_leven,
self.__threshold_dist,
target_id,
source_id))
source_id,
self.__distance_function))
[matches.update(match) for match in list_of_matches]
matches = {k: v for k, v in matches.items() if v > 0.0} # Remove the pairs with zero similarity
return matches

def process_jaccard_leven(self, tup: tuple):
def process_jaccard_distance(self, tup: tuple):

source_data, target_data, threshold, target_id, target_table_name, target_table_unique_identifier, \
target_column_name, target_column_unique_identifier, source_table_name, source_table_unique_identifier, \
source_id, source_column_name, source_column_unique_identifier = tup
source_data, target_data, threshold, _, target_table_name, _, \
target_column_name, _, source_table_name, _, \
_, source_column_name, _, distance_function = tup

if len(set(source_data)) < len(set(target_data)):
set1 = set(source_data)
Expand All @@ -73,11 +80,22 @@ def process_jaccard_leven(self, tup: tuple):
set1 = set(target_data)
set2 = set(source_data)

if distance_function == StringDistanceFunction.Exact:
threshold = 1.0
combinations = self.__get_set_combinations(set1, set2, threshold)

intersection_cnt = 0
for cmb in combinations:
intersection_cnt = intersection_cnt + self.__process_lv(cmb)
if distance_function in [StringDistanceFunction.Levenshtein, StringDistanceFunction.Exact]:
intersection_cnt = intersection_cnt + self.__process_distance(cmb + (levenshtein_distance, True))
elif distance_function == StringDistanceFunction.DamerauLevenshtein:
intersection_cnt = intersection_cnt + self.__process_distance(cmb + (damerau_levenshtein_distance, True))
elif distance_function == StringDistanceFunction.Hamming:
intersection_cnt = intersection_cnt + self.__process_distance(cmb + (hamming_distance, True))
elif distance_function == StringDistanceFunction.Jaro:
intersection_cnt = intersection_cnt + self.__process_distance(cmb + (jaro_similarity, False))
elif distance_function == StringDistanceFunction.JaroWinkler:
intersection_cnt = intersection_cnt + self.__process_distance(cmb + (jaro_winkler_similarity, False))

union_cnt = len(set1) + len(set2) - intersection_cnt

Expand All @@ -89,19 +107,20 @@ def process_jaccard_leven(self, tup: tuple):
return Match(target_table_name, target_column_name,
source_table_name, source_column_name,
sim).to_dict

@staticmethod
def __get_column_combinations(source_table: BaseTable,
target_table: BaseTable,
threshold,
target_id,
source_id):
source_id,
distance_function: StringDistanceFunction):
for source_column, target_column in product(source_table.get_columns(), target_table.get_columns()):
yield source_column.data, target_column.data, threshold, target_id, \
target_table.name, target_table.unique_identifier, \
target_column.name, target_column.unique_identifier, \
source_table.name, source_table.unique_identifier, source_id, \
source_column.name, source_column.unique_identifier
source_column.name, source_column.unique_identifier, distance_function

@staticmethod
def __get_set_combinations(set1: set,
Expand All @@ -126,9 +145,9 @@ def __get_set_combinations(set1: set,
"""
for s1 in set1:
yield str(s1), set2, threshold

@staticmethod
def __process_lv(tup: tuple):
def __process_distance(tup: tuple):
"""
Function that checks whether there exists an entry in the second set whose
similarity with the element from the first set meets or exceeds the given threshold
Expand All @@ -143,8 +162,15 @@ def __process_lv(tup: tuple):
int
1 if there is such an element 0 if not
"""
s1, set2, threshold = tup
s1, set2, threshold, distance_function, normalize = tup

for s2 in set2:
if ratio(s1, str(s2)) >= threshold:
return 1
str_s2 = str(s2)
dist = distance_function(s1, str_s2)
if normalize:
if normalize_distance(dist, s1, str_s2) >= threshold:
return 1
else:
if dist >= threshold:
return 1
return 0
Empty file.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Dict, Tuple

from Levenshtein import ratio
from jellyfish import levenshtein_distance
import math

from .graph import Graph
Expand All @@ -9,6 +9,7 @@
from ..match import Match
from ..base_matcher import BaseMatcher
from ...data_sources.base_table import BaseTable
from ...utils.utils import normalize_distance


class SimilarityFlooding(BaseMatcher):
Expand Down Expand Up @@ -42,7 +43,7 @@ def __calculate_initial_mapping(self):
if n1.name[0:6] == "NodeID" or n2.name[0:6] == "NodeID":
self.__initial_map[NodePair(n1, n2)] = 0.0
else:
similarity = ratio(n1.name, n2.name)
similarity = normalize_distance(levenshtein_distance(n1.name, n2.name), n1.name, n2.name)
self.__initial_map[NodePair(n1, n2)] = similarity

@staticmethod
Expand Down
17 changes: 17 additions & 0 deletions valentine/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,23 @@ def convert_data_type(string: str):
return f
except ValueError:
return string


def normalize_distance(dist: int,
                       str1: str,
                       str2: str) -> float:
    """
    Return a normalized similarity score between two strings given their distance.

    The raw edit distance is divided by the length of the longer string and
    subtracted from 1, so identical strings score 1.0 and completely
    different strings of equal length score 0.0.

    Parameters
    ----------
    dist : int
        The distance between the two strings (hamming, levenshtein or damerau levenshtein)
    str1 : str
        The first compared string
    str2 : str
        The second compared string

    Returns
    -------
    float
        Similarity in [0, 1]; the 1 in max() guards against division by
        zero when both strings are empty.
    """
    return 1 - dist / max(len(str1), len(str2), 1)


def get_project_root():
Expand Down
Loading