Add chi-squared test to categorical diff (#366)

* Add chi2 test to categorical diff * Update tests * Fix bug in tests * Fix another bug in tests * Refactor sum and expected count calculation * Update tests * Refactor naming
capitalone · Jul 29, 2021 · d2aec48 · d2aec48
1 parent 737f0eb
commit d2aec48
Show file tree

Hide file tree

Showing 3 changed files with 95 additions and 1 deletion.
diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py
@@ -1,6 +1,9 @@
+import warnings
 from collections import defaultdict
 from operator import itemgetter
 
+import numpy as np
+
 from . import BaseColumnProfiler
 from .profiler_options import CategoricalOptions
 from . import utils
@@ -89,6 +92,10 @@ def diff(self, other_profile, options=None):
 
         # These stats are only diffed if both profiles are categorical
         if self.is_match and other_profile.is_match:
+            differences["chi2-test"] = self._perform_chi_squared_test(
+                self._categories, self.sample_size,
+                other_profile._categories, other_profile.sample_size
+            )
             differences["statistics"]['categories'] = \
                 utils.find_diff_of_lists_and_sets(self.categories,
                                                   other_profile.categories)
@@ -110,6 +117,80 @@ def diff(self, other_profile, options=None):
 
         return differences
 
+    @staticmethod
+    def _perform_chi_squared_test(categories1, sample_size1,
+                                  categories2, sample_size2):
+        """
+        Performs a Chi Squared test for homogeneity between two groups.
+
+        :param categories1: Categories and respective counts of the first group
+        :type categories1: dict
+        :param sample_size1: Number of samples in first group
+        :type sample_size1: int
+        :param categories2: Categories and respective counts of the second group
+        :type categories2: dict
+        :param sample_size2: Number of samples in second group
+        :type sample_size2: int
+        :return: Results of the chi squared test
+        :rtype: dict
+        """
+        results = {
+            "chi2-statistic": None,
+            "df": None,
+            "p-value": None
+        }
+        cat_counts = utils.add_nested_dictionaries(categories1, categories2)
+
+        # If one or less categories, we have zero/negative degrees of freedom, which is not an
+        # appropriate value for this context
+        num_cats = len(cat_counts)
+        if len(cat_counts) <= 1:
+            warnings.warn("Insufficient number of categories. "
+                          "Chi-squared test cannot be performed.", RuntimeWarning)
+            return results
+
+        # Calculate degrees of freedom
+        # df = (rows - 1) * (cols - 1), in the case of two groups reduces to cols - 1
+        df = num_cats - 1
+        results["df"] = df
+
+        total = sample_size1 + sample_size2
+
+        # If a zero is found in either row or col sums, then an expected count will be
+        # zero. This means the chi2-statistic and p-value are infinity and zero,
+        # so calculation can be skipped.
+        if 0 in [sample_size1, sample_size2] or 0 in cat_counts.values():
+            results["chi2-statistic"] = np.inf
+            results["p-value"] = 0
+            return results
+
+        # Calculate chi-sq statistic
+        chi2_statistic = 0
+        for cat, count in cat_counts.items():
+            expected1 = sample_size1 * count / total
+            expected2 = sample_size2 * count / total
+            chi2_statistic += (categories1.get(cat, 0) - expected1) \
+                              ** 2 / expected1
+            chi2_statistic += (categories2.get(cat, 0) - expected2) \
+                              ** 2 / expected2
+        results["chi2-statistic"] = chi2_statistic
+
+        try:
+            import scipy.stats
+        except ImportError:
+            # Failed, so we return the stats but don't perform the test
+            warnings.warn("Could not import necessary statistical packages. "
+                          "To successfully perform the chi-squared test, please run 'pip "
+                          "install scipy.' Test results will be incomplete.",
+                          RuntimeWarning)
+            return results
+
+        # Calculate p-value, i.e. P(X > chi2_statistic)
+        p_value = 1 - scipy.stats.chi2(df).cdf(chi2_statistic)
+        results["p-value"] = p_value
+
+        return results
+
     @property
     def profile(self):
         """

diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py
@@ -317,7 +317,10 @@ def test_categorical_diff(self):
         df_categorical = pd.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
         profile2 = CategoricalColumn(df_categorical.name)
         profile2.update(df_categorical)
-
+
+        # chi2-statistic = sum((observed-expected)^2/expected for each category in each column)
+        # df = categories - 1
+        # p-value found through using chi2 CDF
         expected_diff = {
             'categorical': 'unchanged',
             'statistics': {
@@ -331,6 +334,11 @@ def test_categorical_diff(self):
                     'n': 1,
                     'maybe': [None, 2]
                 }
+            },
+            'chi2-test': {
+                'chi2-statistic': 82/35,
+                'df': 2,
+                'p-value': 0.3099238764710244
             }
         }
 

diff --git a/dataprofiler/tests/profilers/test_column_profile_compilers.py b/dataprofiler/tests/profilers/test_column_profile_compilers.py
@@ -269,6 +269,11 @@ def test_compiler_stats_diff(self):
                     '1': [1, None], 
                     '10': [None, 1]
                 }
+            },
+            'chi2-test': {
+                'chi2-statistic': 2.1,
+                'df': 2,
+                'p-value': 0.3499377491111554
             }
         }
         self.assertDictEqual(expected_diff, compiler1.diff(compiler2))