From 1303abe04b48fa87c67d8d9b3a13f8cb88e79afb Mon Sep 17 00:00:00 2001 From: az85252 Date: Fri, 16 Jul 2021 15:36:33 -0500 Subject: [PATCH] Unalikeability Revisions (#341) * revised unalikeability functionality * added test cases for revised unalikeability functionality * update to 0.6.1 --- dataprofiler/profilers/categorical_column_profile.py | 2 ++ .../tests/profilers/test_categorical_column_profile.py | 10 ++++++++++ dataprofiler/version.py | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index dad024935..c11f63e6b 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -260,6 +260,8 @@ def unalikeability(self): if self.sample_size == 0: return None + elif self.sample_size == 1: + return 0 unalike_sum = 0 for category in self._categories: unalike_sum += (self.sample_size - self._categories[category]) * \ diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index 7803c6806..1f7ef1e37 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -373,6 +373,16 @@ def test_unalikeability(self): profile.update(df_categorical) self.assertEqual(profile.unalikeability, 2*(10 + 15 + 6)/90) + df_categorical = pd.Series(["a"]) + profile = CategoricalColumn(df_categorical.name) + profile.update(df_categorical) + self.assertEqual(0, profile.unalikeability) + + df_categorical = pd.Series([]) + profile = CategoricalColumn(df_categorical.name) + profile.update(df_categorical) + self.assertEqual(None, profile.unalikeability) + def test_top_k_categories_change(self): # Test if top_k_categories is None options = CategoricalOptions() diff --git a/dataprofiler/version.py b/dataprofiler/version.py index 7fe3b57f8..3eaebb9fa 100644 --- a/dataprofiler/version.py +++ b/dataprofiler/version.py @@ -4,7 +4,7 @@ MAJOR = 0 MINOR = 6 -MICRO = 0 +MICRO = 1 VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)