NOTE(review): this patch was rebuilt from a copy whose line breaks and
indentation had been collapsed. Hunk line counts were re-checked, but the
indentation of context lines is inferred from Python syntax; if a hunk is
rejected, retry with `git apply --ignore-whitespace`.
Two amendments were made to added lines: ColumnDataLabelerCompiler.diff walks
self._profiles in order instead of a set intersection, and the empty Series
in test_compiler_data_labeler_diff is given an explicit dtype. The post-image
blob ids on the affected `index` lines therefore no longer match.

diff --git a/dataprofiler/profilers/column_profile_compilers.py b/dataprofiler/profilers/column_profile_compilers.py
index d3f9d96c2..f3511d9ae 100644
--- a/dataprofiler/profilers/column_profile_compilers.py
+++ b/dataprofiler/profilers/column_profile_compilers.py
@@ -325,9 +325,40 @@ def profile(self):
         }
         # TODO: Only works for last profiler. Abstracted for now.
         for _, profiler in self._profiles.items():
-            profile["data_label"] = profiler.data_label
-            profile["statistics"].update(profiler.profile)
+            col_profile = profiler.profile
+            profile["data_label"] = col_profile.pop("data_label")
+            profile["statistics"].update(col_profile)
         return profile
+
+    def diff(self, other, options=None):
+        """
+        Finds the difference between 2 compilers and returns the report
+
+        :param other: profile compiler finding the difference with this one.
+        :type other: ColumnDataLabelerCompiler
+        :param options: options to change results of the difference
+        :type options: dict
+        :return: difference of the profiles
+        :rtype: dict
+        """
+        # Call super for compiler instance check
+        diff_profile = super().diff(other, options)
+        diff_profile["statistics"] = dict()
+
+        # Iterate through the profile(s) shared by both compilers. Walk
+        # self._profiles (not a set intersection) so the order, and therefore
+        # which profiler's data_label wins, is deterministic.
+        for key, profiler in self._profiles.items():
+            if key not in other._profiles:
+                continue
+            profile_diff = profiler.diff(other._profiles[key], options)
+            diff_profile["data_label"] = profile_diff.pop("data_label")
+            diff_profile["statistics"].update(profile_diff)
+
+        if not diff_profile["statistics"]:
+            diff_profile.pop("statistics")
+
+        return diff_profile
 
 
 class UnstructuredCompiler(BaseCompiler):
diff --git a/dataprofiler/profilers/data_labeler_column_profile.py b/dataprofiler/profilers/data_labeler_column_profile.py
index a8180889e..55b7369c0 100644
--- a/dataprofiler/profilers/data_labeler_column_profile.py
+++ b/dataprofiler/profilers/data_labeler_column_profile.py
@@ -233,6 +233,7 @@ def profile(self):
         Property for profile. Returns the profile of the column.
         """
         profile = {
+            "data_label": self.data_label,
             "avg_predictions": self.avg_predictions,
             "data_label_representation": self.label_representation,
             "times": self.times
diff --git a/dataprofiler/tests/profilers/test_column_profile_compilers.py b/dataprofiler/tests/profilers/test_column_profile_compilers.py
index dac6b7f81..529663ef7 100644
--- a/dataprofiler/tests/profilers/test_column_profile_compilers.py
+++ b/dataprofiler/tests/profilers/test_column_profile_compilers.py
@@ -269,6 +269,79 @@ def test_compiler_stats_diff(self):
         expected_diff = {}
         self.assertDictEqual(expected_diff, compiler1.diff(compiler2))
 
+    @mock.patch(
+        'dataprofiler.profilers.data_labeler_column_profile.DataLabeler')
+    @mock.patch("dataprofiler.profilers.data_labeler_column_profile."
+                "DataLabelerColumn.update")
+    def test_compiler_data_labeler_diff(self, *mocked_datalabeler):
+        # Initialize dummy data
+        data = pd.Series([], dtype=object)
+
+        # Test normal diff
+        compiler1 = col_pro_compilers.ColumnDataLabelerCompiler(data)
+        compiler2 = col_pro_compilers.ColumnDataLabelerCompiler(data)
+
+        # Mock out the data_label, avg_predictions, and label_representation
+        # properties
+        with mock.patch("dataprofiler.profilers.data_labeler_column_profile"
+                        ".DataLabelerColumn.data_label"), \
+            mock.patch("dataprofiler.profilers.data_labeler_column_profile."
+                       "DataLabelerColumn.avg_predictions"), \
+            mock.patch("dataprofiler.profilers.data_labeler_column_profile."
+                       "DataLabelerColumn.label_representation"):
+            compiler1._profiles["data_labeler"].data_label = "a"
+            compiler1._profiles["data_labeler"].avg_predictions = {
+                "a": 0.25,
+                "b": 0.0,
+                "c": 0.75
+            }
+            compiler1._profiles["data_labeler"].label_representation = {
+                "a": 0.15,
+                "b": 0.01,
+                "c": 0.84
+            }
+
+            compiler2._profiles["data_labeler"].data_label = "b"
+            compiler2._profiles["data_labeler"].avg_predictions = {
+                "a": 0.25,
+                "b": 0.70,
+                "c": 0.05
+            }
+            compiler2._profiles["data_labeler"].label_representation = {
+                "a": 0.99,
+                "b": 0.01,
+                "c": 0.0
+            }
+
+            expected_diff = {
+                'statistics': {
+                    'avg_predictions': {
+                        'a': 'unchanged',
+                        'b': -0.7,
+                        'c': 0.7
+                    },
+                    'label_representation': {
+                        'a': -0.84,
+                        'b': 'unchanged',
+                        'c': 0.84
+                    }
+                },
+                'data_label': [['a'], [], ['b']]
+            }
+            self.assertDictEqual(expected_diff, compiler1.diff(compiler2))
+
+        # Test disabling one datalabeler profile for compiler diff
+        options = StructuredOptions()
+        options.data_labeler.is_enabled = False
+        compiler1 = col_pro_compilers.ColumnDataLabelerCompiler(data, options)
+        expected_diff = {}
+        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))
+
+        # Test disabling both datalabeler profiles for compiler diff
+        compiler2 = col_pro_compilers.ColumnDataLabelerCompiler(data, options)
+        expected_diff = {}
+        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))
+
     @mock.patch.multiple(
         col_pro_compilers.BaseCompiler, __abstractmethods__=set())
     def test_no_profilers_error(self):
diff --git a/dataprofiler/tests/profilers/test_data_labeler_column_profile.py b/dataprofiler/tests/profilers/test_data_labeler_column_profile.py
index fe29ca9b3..d927c69dd 100644
--- a/dataprofiler/tests/profilers/test_data_labeler_column_profile.py
+++ b/dataprofiler/tests/profilers/test_data_labeler_column_profile.py
@@ -51,10 +51,11 @@ def test_base_case(self, mock_instance):
         self.assertEqual(None, profiler.avg_predictions)
         six.assertCountEqual(
             self,
-            ["avg_predictions", "data_label_representation", "times"],
+            ["data_label", "avg_predictions", "data_label_representation", "times"],
             list(profiler.profile.keys())
         )
         self.assertEqual({
+            "data_label": None,
             "avg_predictions": None,
             "data_label_representation": None,
             "times": defaultdict()
@@ -140,6 +141,7 @@ def test_profile(self, mock_instance):
         profiler = DataLabelerColumn(data.name)
 
         expected_profile = {
+            "data_label": 'a',
             "avg_predictions": dict(a=2/3, b=1/3),
             "data_label_representation": dict(a=2/3, b=1/3),
             "times": defaultdict(float, {'data_labeler_predict': 1.0})
@@ -187,6 +189,7 @@ def test_profile_merge(self, mock_instance):
         data2 = pd.Series(['4', '5', '6', '7', '9', '10', '12'])
 
         expected_profile = {
+            "data_label": "a|b",
             "avg_predictions": dict(a=54 / 99, b=45 / 99),
             "data_label_representation": dict(a=54 / 99, b=45 / 99),
             "times": defaultdict(float, {'data_labeler_predict': 2.0})
@@ -229,6 +232,7 @@ def test_profile_merge(self, mock_instance):
         # Check adding even more profiles together
         profiler3 = profiler + profiler3
         expected_profile = {
+            "data_label": "a|b",
             "avg_predictions": dict(a=8 / 15, b=7 / 15),
             "data_label_representation": dict(a=8 / 15, b=7 / 15),
             "times": defaultdict(float, {'data_labeler_predict': 3.0})