diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 35607fc..42f19a2 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.7, 3.8] steps: - uses: actions/checkout@v2 diff --git a/docs/releases.md b/docs/releases.md index d23b139..467c951 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -1,3 +1,6 @@ +v0.2.2 +- Update grouping to include all strings only if identical lists of strings are compared + v0.2.0 - Update naming convention matcher --> model - Update documentation diff --git a/polyfuzz/__init__.py b/polyfuzz/__init__.py index 777bdd9..ac0dc65 100644 --- a/polyfuzz/__init__.py +++ b/polyfuzz/__init__.py @@ -1,2 +1,2 @@ from .polyfuzz import PolyFuzz -__version__ = "0.2.1" +__version__ = "0.2.2" diff --git a/polyfuzz/polyfuzz.py b/polyfuzz/polyfuzz.py index e607b49..f854203 100644 --- a/polyfuzz/polyfuzz.py +++ b/polyfuzz/polyfuzz.py @@ -189,13 +189,17 @@ def visualize_precision_recall(self, def group(self, model: Union[str, BaseMatcher] = None, - link_min_similarity: float = 0.75): + link_min_similarity: float = 0.75, + group_all_strings: bool = False): """ From the matches, group the `To` matches together using single linkage Arguments: model: you can choose one of the models in `polyfuzz.models` to be used as a grouper link_min_similarity: the minimum similarity between strings before they are grouped in a single linkage fashion + group_all_strings: if you want to compare a list of strings with itself and then cluster + those strings, set this to True. Otherwise, only the strings that + were mapped To are clustered. Updates: self.matches: Adds a column `Group` that is the grouped version of the `To` column @@ -223,13 +227,9 @@ def group(self, elif not model: model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity) + # Group per model for name, match in self.matches.items(): - strings = list(self.matches[name].To.dropna().unique()) - matches = model.match(strings, strings) - clusters, cluster_id_map, cluster_name_map = single_linkage(matches, link_min_similarity) - self._map_groups(name, cluster_name_map) - self.clusters[name] = clusters - self.cluster_mappings[name] = cluster_id_map + self._create_groups(name, model, link_min_similarity, group_all_strings) def get_ids(self) -> Union[str, List[str], None]: """ Get all model ids for easier access """ @@ -285,17 +285,33 @@ def get_cluster_mappings(self, name: str = None) -> Mapping[str, int]: return self.cluster_mappings - def _map_groups(self, name: str, cluster_name_map: Mapping[str, str]): - """ Map the 'to' list to groups """ + def _create_groups(self, + name: str, + model: BaseMatcher, + link_min_similarity: float, + group_all_strings: bool): + """ Create groups based on either the To mappings if you compare two different lists of strings, or + the From mappings if you compare lists of strings that are equal (set group_all_strings to True) + """ + + if group_all_strings: + strings = list(self.matches[name].From.dropna().unique()) + else: + strings = list(self.matches[name].To.dropna().unique()) + + # Create clusters + matches = model.match(strings, strings) + clusters, cluster_id_map, cluster_name_map = single_linkage(matches, link_min_similarity) + + # Map the `to` list to groups df = self.matches[name] df["Group"] = df['To'].map(cluster_name_map).fillna(df['To']) - - # Fix that some mappings from "From" end up in "Group" - df.loc[(df.From != df.To) & - (df.From == df.Group), "Group"] = df.loc[(df.From != df.To) & - (df.From == df.Group), "To"] self.matches[name] = df + # Track clusters and their ids + self.clusters[name] = clusters + self.cluster_mappings[name] = cluster_id_map + def _update_model_ids(self): """ Update model ids such that there is no overlap between ids """ # Give models a model_id if it didn't already exist diff --git a/setup.py b/setup.py index 93cb72f..b9fe0fc 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ ] base_packages = [ - "numpy>= 1.18.5", + "numpy>= 1.18.5,<=1.19.4", "scipy>= 1.3.1", "pandas>= 0.25.3", "tqdm>=4.41.1", @@ -25,7 +25,7 @@ ] fast_cosine = ["sparse_dot_topn>=0.2.9"] -embeddings_packages = ["flair>= 0.6.1.post1"] +embeddings_packages = ["torch>=1.2.0", "flair>= 0.7"] extra_packages = embeddings_packages + fast_cosine @@ -37,7 +37,7 @@ setup( name="polyfuzz", packages=find_packages(exclude=["notebooks", "docs"]), - version="0.2.1", + version="0.2.2", author="Maarten Grootendorst", author_email="maartengrootendorst@gmail.com", description="PolyFuzz performs fuzzy string matching, grouping, and evaluation.", diff --git a/tests/test_polyfuzz.py b/tests/test_polyfuzz.py index 5384bfe..15fa920 100644 --- a/tests/test_polyfuzz.py +++ b/tests/test_polyfuzz.py @@ -51,6 +51,20 @@ def test_grouper(method): assert model.get_cluster_mappings() == {'apples': 1, 'apple': 1} +def test_grouper_same_list(): + model = PolyFuzz("TF-IDF").match(from_list, from_list) + model.group(link_min_similarity=0.75, group_all_strings=True) + matches = model.get_matches() + + assert isinstance(matches, pd.DataFrame) + assert matches.Similarity.mean() > 0.3 + assert len(matches) == 6 + assert list(matches.columns) == ['From', 'To', 'Similarity', 'Group'] + + assert model.get_clusters() == {1: ['apples', 'apple', 'appl']} + assert model.get_cluster_mappings() == {'apples': 1, 'apple': 1, 'appl': 1} + + @pytest.mark.parametrize("method", ["Unknown Model"]) def test_wrongbase_model(method): with pytest.raises(ValueError):