Skip to content

Commit

Permalink
Woah. NICE. (#336)
Browse files Browse the repository at this point in the history
  • Loading branch information
grant-eden committed Jul 16, 2021
1 parent 5116641 commit 3aee648
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 3 deletions.
32 changes: 30 additions & 2 deletions dataprofiler/profilers/column_profile_compilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,9 +325,37 @@ def profile(self):
}
# TODO: Only works for last profiler. Abstracted for now.
for _, profiler in self._profiles.items():
profile["data_label"] = profiler.data_label
profile["statistics"].update(profiler.profile)
col_profile = profiler.profile
profile["data_label"] = col_profile.pop("data_label")
profile["statistics"].update(col_profile)
return profile

def diff(self, other, options=None):
    """
    Build a report of how this compiler's profiles differ from those of
    another compiler.

    :param other: compiler whose profiles are compared against this one
    :type other: ColumnDataLabelerCompiler
    :param options: settings handed through to each profile's own diff
    :type options: dict
    :return: the parent class's diff report with the per-profile
        differences added to it
    :rtype: dict
    """
    # The parent implementation is called first for its compiler instance
    # check; its return value is the report that gets filled in below.
    report = super().diff(other, options)
    report["statistics"] = {}

    # Only profiles that exist in both compilers can be compared.
    common_names = set(self._profiles) & set(other._profiles)
    for name in common_names:
        profile_diff = self._profiles[name].diff(other._profiles[name], options)
        # The label difference sits at the top level of the report; every
        # remaining entry is folded into the statistics section.
        report["data_label"] = profile_diff.pop("data_label")
        report["statistics"].update(profile_diff)

    # Drop the statistics section entirely when nothing was collected.
    if not report["statistics"]:
        del report["statistics"]

    return report


class UnstructuredCompiler(BaseCompiler):
Expand Down
1 change: 1 addition & 0 deletions dataprofiler/profilers/data_labeler_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ def profile(self):
Property for profile. Returns the profile of the column.
"""
profile = {
"data_label": self.data_label,
"avg_predictions": self.avg_predictions,
"data_label_representation": self.label_representation,
"times": self.times
Expand Down
73 changes: 73 additions & 0 deletions dataprofiler/tests/profilers/test_column_profile_compilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,79 @@ def test_compiler_stats_diff(self):
expected_diff = {}
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

@mock.patch(
    'dataprofiler.profilers.data_labeler_column_profile.DataLabeler')
@mock.patch("dataprofiler.profilers.data_labeler_column_profile."
            "DataLabelerColumn.update")
def test_compiler_data_labeler_diff(self, *mocked_datalabeler):
    """
    Exercise ColumnDataLabelerCompiler.diff in three situations: both
    compilers carrying a data labeler profile, only one of them carrying
    it, and neither of them carrying it.

    The decorators patch out DataLabeler and DataLabelerColumn.update;
    the resulting mocks arrive through *mocked_datalabeler and are not
    used directly in the test body.
    """
    # Initialize dummy data
    data = pd.Series([])

    # Test normal diff
    compiler1 = col_pro_compilers.ColumnDataLabelerCompiler(data)
    compiler2 = col_pro_compilers.ColumnDataLabelerCompiler(data)

    # Mock out the data_label, avg_predictions, and label_representation
    # properties so that fixed values can be assigned on each profile
    # NOTE(review): the comparison below is kept inside this block on the
    # assumption that the assigned values are only visible while the
    # patches are active -- confirm against the original file's nesting.
    with mock.patch("dataprofiler.profilers.data_labeler_column_profile"
                    ".DataLabelerColumn.data_label"), \
            mock.patch("dataprofiler.profilers.data_labeler_column_profile."
                       "DataLabelerColumn.avg_predictions"), \
            mock.patch("dataprofiler.profilers.data_labeler_column_profile."
                       "DataLabelerColumn.label_representation"):
        # Values for the first compiler's data labeler profile
        compiler1._profiles["data_labeler"].data_label = "a"
        compiler1._profiles["data_labeler"].avg_predictions = {
            "a": 0.25,
            "b": 0.0,
            "c": 0.75
        }
        compiler1._profiles["data_labeler"].label_representation = {
            "a": 0.15,
            "b": 0.01,
            "c": 0.84
        }

        # Values for the second compiler's data labeler profile
        compiler2._profiles["data_labeler"].data_label = "b"
        compiler2._profiles["data_labeler"].avg_predictions = {
            "a": 0.25,
            "b": 0.70,
            "c": 0.05
        }
        compiler2._profiles["data_labeler"].label_representation = {
            "a": 0.99,
            "b": 0.01,
            "c": 0.0
        }

        # Numeric entries are the first compiler's value minus the
        # second's; equal values are reported as 'unchanged'
        expected_diff = {
            'statistics': {
                'avg_predictions': {
                    'a': 'unchanged',
                    'b': -0.7,
                    'c': 0.7
                },
                'label_representation': {
                    'a': -0.84,
                    'b': 'unchanged',
                    'c': 0.84
                }
            },
            'data_label': [['a'], [], ['b']]
        }
        self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

    # Test disabling one datalabeler profile for compiler diff
    # (compiler2 still has its data labeler profile, so the two compilers
    # share no profile and the diff is expected to be empty)
    options = StructuredOptions()
    options.data_labeler.is_enabled = False
    compiler1 = col_pro_compilers.ColumnDataLabelerCompiler(data, options)
    expected_diff = {}
    self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

    # Test disabling both datalabeler profiles for compiler diff
    compiler2 = col_pro_compilers.ColumnDataLabelerCompiler(data, options)
    expected_diff = {}
    self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

@mock.patch.multiple(
col_pro_compilers.BaseCompiler, __abstractmethods__=set())
def test_no_profilers_error(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,11 @@ def test_base_case(self, mock_instance):
self.assertEqual(None, profiler.avg_predictions)
six.assertCountEqual(
self,
["avg_predictions", "data_label_representation", "times"],
["data_label", "avg_predictions", "data_label_representation", "times"],
list(profiler.profile.keys())
)
self.assertEqual({
"data_label": None,
"avg_predictions": None,
"data_label_representation": None,
"times": defaultdict()
Expand Down Expand Up @@ -140,6 +141,7 @@ def test_profile(self, mock_instance):
profiler = DataLabelerColumn(data.name)

expected_profile = {
"data_label": 'a',
"avg_predictions": dict(a=2/3, b=1/3),
"data_label_representation": dict(a=2/3, b=1/3),
"times": defaultdict(float, {'data_labeler_predict': 1.0})
Expand Down Expand Up @@ -187,6 +189,7 @@ def test_profile_merge(self, mock_instance):
data2 = pd.Series(['4', '5', '6', '7', '9', '10', '12'])

expected_profile = {
"data_label": "a|b",
"avg_predictions": dict(a=54 / 99, b=45 / 99),
"data_label_representation": dict(a=54 / 99, b=45 / 99),
"times": defaultdict(float, {'data_labeler_predict': 2.0})
Expand Down Expand Up @@ -229,6 +232,7 @@ def test_profile_merge(self, mock_instance):
# Check adding even more profiles together
profiler3 = profiler + profiler3
expected_profile = {
"data_label": "a|b",
"avg_predictions": dict(a=8 / 15, b=7 / 15),
"data_label_representation": dict(a=8 / 15, b=7 / 15),
"times": defaultdict(float, {'data_labeler_predict': 3.0})
Expand Down

0 comments on commit 3aee648

Please sign in to comment.