Woah. NICE. (#336)

capitalone · Jul 16, 2021 · 3aee648 · 3aee648
1 parent 5116641
commit 3aee648
Show file tree

Hide file tree

Showing 4 changed files with 109 additions and 3 deletions.
diff --git a/dataprofiler/profilers/column_profile_compilers.py b/dataprofiler/profilers/column_profile_compilers.py
@@ -325,9 +325,37 @@ def profile(self):
         }
         # TODO: Only works for last profiler. Abstracted for now.
         for _, profiler in self._profiles.items():
-            profile["data_label"] = profiler.data_label
-            profile["statistics"].update(profiler.profile)
+            col_profile = profiler.profile
+            profile["data_label"] = col_profile.pop("data_label")
+            profile["statistics"].update(col_profile)
         return profile
+
+    def diff(self, other, options=None):
+        """
+        Finds the difference between 2 compilers and returns the report.
+
+        :param other: profile compiler to compare against this one.
+        :type other: ColumnDataLabelerCompiler
+        :param options: options passed to super().diff and each profile diff
+        :type options: dict
+        :return: difference of the profiles; "statistics" is omitted if empty
+        :rtype: dict
+        """
+        # Call super for compiler instance check; its result is the base report
+        diff_profile = super().diff(other, options)
+        diff_profile["statistics"] = dict()
+
+        # Only diff the profile(s) whose keys exist in BOTH compilers
+        all_profiles = set(self._profiles.keys()) & set(other._profiles.keys())
+        for key in all_profiles:
+            diff = self._profiles[key].diff(other._profiles[key], options)
+            diff_profile["data_label"] = diff.pop("data_label")  # top level
+            diff_profile["statistics"].update(diff)  # everything else
+
+        if not diff_profile["statistics"]:  # nothing was diffed: drop the key
+            diff_profile.pop("statistics")
+
+        return diff_profile
 
 
 class UnstructuredCompiler(BaseCompiler):

diff --git a/dataprofiler/profilers/data_labeler_column_profile.py b/dataprofiler/profilers/data_labeler_column_profile.py
@@ -233,6 +233,7 @@ def profile(self):
         Property for profile. Returns the profile of the column.
         """
         profile = {
+            "data_label": self.data_label,
             "avg_predictions": self.avg_predictions,
             "data_label_representation": self.label_representation,
             "times": self.times

diff --git a/dataprofiler/tests/profilers/test_column_profile_compilers.py b/dataprofiler/tests/profilers/test_column_profile_compilers.py
@@ -269,6 +269,79 @@ def test_compiler_stats_diff(self):
         expected_diff = {}
         self.assertDictEqual(expected_diff, compiler1.diff(compiler2))
 
+    @mock.patch(
+        'dataprofiler.profilers.data_labeler_column_profile.DataLabeler')
+    @mock.patch("dataprofiler.profilers.data_labeler_column_profile."
+               "DataLabelerColumn.update")
+    def test_compiler_data_labeler_diff(self, *mocked_datalabeler):
+        # Dummy (empty) data; DataLabelerColumn.update is patched out above
+        data = pd.Series([])
+
+        # Test normal diff: both compilers hold a "data_labeler" profile
+        compiler1 = col_pro_compilers.ColumnDataLabelerCompiler(data)
+        compiler2 = col_pro_compilers.ColumnDataLabelerCompiler(data)
+
+        # Mock out the data_label, avg_predictions, and label_representation
+        # properties so that fixed values can be assigned to them below
+        with mock.patch("dataprofiler.profilers.data_labeler_column_profile"
+                        ".DataLabelerColumn.data_label"), \
+             mock.patch("dataprofiler.profilers.data_labeler_column_profile."
+                        "DataLabelerColumn.avg_predictions"), \
+             mock.patch("dataprofiler.profilers.data_labeler_column_profile."
+                        "DataLabelerColumn.label_representation"):
+            compiler1._profiles["data_labeler"].data_label = "a"
+            compiler1._profiles["data_labeler"].avg_predictions = {
+                "a": 0.25,
+                "b": 0.0,
+                "c": 0.75
+            }
+            compiler1._profiles["data_labeler"].label_representation = {
+                "a": 0.15,
+                "b": 0.01,
+                "c": 0.84
+            }
+
+            compiler2._profiles["data_labeler"].data_label = "b"
+            compiler2._profiles["data_labeler"].avg_predictions = {
+                "a": 0.25,
+                "b": 0.70,
+                "c": 0.05
+            }
+            compiler2._profiles["data_labeler"].label_representation = {
+                "a": 0.99,
+                "b": 0.01,
+                "c": 0.0
+            }
+
+            expected_diff = {
+                'statistics': {
+                    'avg_predictions': {
+                        'a': 'unchanged',
+                        'b': -0.7,
+                        'c': 0.7
+                    },
+                    'label_representation': {
+                        'a': -0.84, 
+                        'b': 'unchanged',
+                        'c': 0.84
+                    }
+                },
+                'data_label': [['a'], [], ['b']]  # NOTE(review): confirm order
+            }
+            self.assertDictEqual(expected_diff, compiler1.diff(compiler2))
+
+        # Test disabling one datalabeler profile for compiler diff; the
+        options = StructuredOptions()  # expected result is an empty report
+        options.data_labeler.is_enabled = False
+        compiler1 = col_pro_compilers.ColumnDataLabelerCompiler(data, options)
+        expected_diff = {}
+        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))
+
+        # Test disabling both datalabeler profiles for compiler diff
+        compiler2 = col_pro_compilers.ColumnDataLabelerCompiler(data, options)
+        expected_diff = {}
+        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))
+
     @mock.patch.multiple(
         col_pro_compilers.BaseCompiler, __abstractmethods__=set())
     def test_no_profilers_error(self):

diff --git a/dataprofiler/tests/profilers/test_data_labeler_column_profile.py b/dataprofiler/tests/profilers/test_data_labeler_column_profile.py
@@ -51,10 +51,11 @@ def test_base_case(self, mock_instance):
             self.assertEqual(None, profiler.avg_predictions)
             six.assertCountEqual(
                 self,
-                ["avg_predictions", "data_label_representation", "times"],
+                ["data_label", "avg_predictions", "data_label_representation", "times"],
                 list(profiler.profile.keys())
             )
             self.assertEqual({
+                    "data_label": None,
                     "avg_predictions": None,
                     "data_label_representation": None,
                     "times": defaultdict()
@@ -140,6 +141,7 @@ def test_profile(self, mock_instance):
         profiler = DataLabelerColumn(data.name)
 
         expected_profile = {
+            "data_label": 'a',
             "avg_predictions": dict(a=2/3, b=1/3),
             "data_label_representation": dict(a=2/3, b=1/3),
             "times": defaultdict(float, {'data_labeler_predict': 1.0})
@@ -187,6 +189,7 @@ def test_profile_merge(self, mock_instance):
         data2 = pd.Series(['4', '5', '6', '7', '9', '10', '12'])
 
         expected_profile = {
+            "data_label": "a|b",
             "avg_predictions": dict(a=54 / 99, b=45 / 99),
             "data_label_representation": dict(a=54 / 99, b=45 / 99),
             "times": defaultdict(float, {'data_labeler_predict': 2.0})
@@ -229,6 +232,7 @@ def test_profile_merge(self, mock_instance):
             # Check adding even more profiles together
             profiler3 = profiler + profiler3
             expected_profile = {
+                "data_label": "a|b",
                 "avg_predictions": dict(a=8 / 15, b=7 / 15),
                 "data_label_representation": dict(a=8 / 15, b=7 / 15),
                 "times": defaultdict(float, {'data_labeler_predict': 3.0})