Commit 39bb45e

fix(datarelations): drop_nan before VIF computation (#62)

* drop_nan before VIF computation
* fix linalg error
* codacy linting
* finish linting
* add test suite for data relations engine
* lint test file
* pytest correct fixture definition
* add encoding to open

1 parent b2b05a8 commit 39bb45e
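For context on the fix itself, the sketch below reproduces the failure mode with a small hypothetical frame (the column names are illustrative; variance_inflation_factor is the statsmodels helper the library aliases as vif). A NaN that reaches the VIF design matrix makes the per-column OLS fits fail, surfacing as the linear algebra error this commit refers to; dropping NaN rows first, as the commit does, avoids it.

# Sketch of the failure mode addressed by this commit (hypothetical data).
from numpy import nan
from pandas import DataFrame
from statsmodels.stats.outliers_influence import variance_inflation_factor

df = DataFrame({
    'a': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
    'b': [2.0, 3.9, 6.1, 8.0, 9.8, 12.1],
    'c': [5.0, nan, 2.0, 8.0, 1.0, 3.0],  # a single NaN used to break every VIF fit
})

# Pre-fix: passing df.values directly lets the NaN reach the design matrix
# and the computation errors out. Post-fix: NaN rows are dropped beforehand.
clean = df.dropna()
vifs = {col: variance_inflation_factor(clean.values, i)
        for i, col in enumerate(clean.columns)}
print(vifs)  # finite scores computed on the complete rows only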

File tree

5 files changed: +126 −33 lines


requirements-dev.txt

Lines changed: 2 additions & 0 deletions
@@ -4,3 +4,5 @@ pytest
 sphinx
 myst-parser
 twine
+pytest
+nbconvert

src/ydata_quality/data_relations/engine.py

Lines changed: 31 additions & 20 deletions
@@ -3,9 +3,8 @@
 """
 from typing import List, Optional, Tuple
 
+from numpy import argwhere, ones, tril
 from pandas import DataFrame
-from numpy import ones, tril, argwhere
-
 from src.ydata_quality.core.warnings import Priority
 
 from ..core import QualityEngine, QualityWarning
@@ -40,7 +39,7 @@ def dtypes(self):
     def dtypes(self, df_dtypes: Tuple[DataFrame, dict]):
         df, dtypes = df_dtypes
         if not isinstance(dtypes, dict):
-            self._logger.warning("Property 'dtypes' should be a dictionary. Defaulting to all column dtypes inference.")
+            self._logger.debug("Property 'dtypes' should be a dictionary. Defaulting to all column dtypes inference.")
             dtypes = {}
         cols_not_in_df = [col for col in dtypes if col not in df.columns]
         if len(cols_not_in_df) > 0:
@@ -49,7 +48,7 @@ def dtypes(self, df_dtypes: Tuple[DataFrame, dict]):
         wrong_dtypes = [col for col, dtype in dtypes.items() if dtype not in supported_dtypes]
         if len(wrong_dtypes) > 0:
             self._logger.warning(
-                "Columns %s of dtypes where not defined with a supported dtype and will be inferred.", wrong_dtypes)
+                "Columns %s have no valid dtypes. Supported dtypes will be inferred.", wrong_dtypes)
         dtypes = {key: val for key, val in dtypes.items() if key not in cols_not_in_df + wrong_dtypes}
         df_col_set = set(df.columns)
         dtypes_col_set = set(dtypes.keys())
@@ -64,7 +63,7 @@ def dtypes(self, df_dtypes: Tuple[DataFrame, dict]):
     def evaluate(self, df: DataFrame, dtypes: Optional[dict] = None, label: str = None, corr_th: float = 0.8,
                  vif_th: float = 5, p_th: float = 0.05, plot: bool = True, summary: bool = True) -> dict:
         """Runs tests to the validation run results and reports based on found errors.
-        We perform standard normalization of numerical features in order to unbias VIF and partial correlation methods.
+        Standard normalization of numerical features is performed as a preprocessing operation.
         This bias correction produces results equivalent to adding a constant feature to the dataset.
 
         Args:
@@ -74,17 +73,25 @@ def evaluate(self, df: DataFrame, dtypes: Optional[dict] = None, label: str = No
             label (Optional[str]): A string identifying the label feature column
             corr_th (float): Absolute threshold for high correlation detection. Defaults to 0.8.
             vif_th (float): Variance Inflation Factor threshold for numerical independence test.
-                Typically 5-10 is recommended. Defaults to 5.
-            p_th (float): Fraction of the right tail of the chi squared CDF.
-                Defines threshold for categorical independence test. Defaults to 0.05.
+                Typically a minimum of 5-10 is recommended. Defaults to 5.
+            p_th (float): Fraction of the right tail of the chi squared CDF defining threshold for categorical
+                independence test. Defaults to 0.05.
             plot (bool): Pass True to produce all available graphical outputs, False to suppress all graphical output.
             summary (bool): Print a report containing all the warnings detected during the data quality analysis.
         """
-        assert label in df.columns or not label, "The provided label name does not exist as a column in the dataset"
+        results = {}
+        nan_or_const = df.nunique() < 2  # Constant columns or all nan columns
+        label = None if label in nan_or_const else label
+        self._logger.warning('The columns %s are constant or all NaNs and \
+were dropped from this evaluation.', list(nan_or_const.index[nan_or_const]))
+        df = df.drop(columns=nan_or_const.index[nan_or_const])  # Constant columns or all nan columns are dropped
+        if df.shape[1] < 2:
+            self._logger.warning('There are fewer than 2 columns on the dataset where correlations can be computed. \
+Skipping the DataRelations engine execution.')
+            return results
         self.dtypes = (df, dtypes)  # Consider refactoring QualityEngine dtypes (df as argument of setter)
         df = standard_normalize(df, self.dtypes)
-        results = {}
-        corr_mat, _ = correlation_matrix(df, self.dtypes, True)
+        corr_mat, _ = correlation_matrix(df, self.dtypes, label, True)
         p_corr_mat = partial_correlation_matrix(corr_mat)
         results['Correlations'] = {'Correlation matrix': corr_mat, 'Partial correlation matrix': p_corr_mat}
         if plot:
@@ -96,9 +103,12 @@ def evaluate(self, df: DataFrame, dtypes: Optional[dict] = None, label: str = No
             results['Colliders'] = self._collider_detection(corr_mat, p_corr_mat, corr_th)
         else:
             self._logger.warning('The partial correlation matrix is not computable for this dataset. \
-Skipping potential confounder and collider detection tests.')
+Skipped potential confounder and collider detection tests.')
         if label:
-            results['Feature Importance'] = self._feature_importance(corr_mat, p_corr_mat, label, corr_th)
+            try:
+                results['Feature Importance'] = self._feature_importance(corr_mat, p_corr_mat, label, corr_th)
+            except AssertionError as exception:
+                self._logger.warning(str(exception))
         results['High Collinearity'] = self._high_collinearity_detection(df, self.dtypes, label, vif_th, p_th=p_th)
         self._clean_warnings()
         if summary:
@@ -123,9 +133,9 @@ def _confounder_detection(self, corr_mat: DataFrame, par_corr_mat: DataFrame,
                 QualityWarning(
                     test=QualityWarning.Test.CONFOUNDED_CORRELATIONS, category=QualityWarning.Category.DATA_RELATIONS,
                     priority=Priority.P2, data=confounded_pairs,
-                    description=f"""
-Found {len(confounded_pairs)} independently correlated variable pairs that disappeared after controling\
-for the remaining variables. This is an indicator of potential confounder effects in the dataset."""))
+                    description=f"""Found {len(confounded_pairs)} independently correlated variable pairs that \
+disappeared after controling for the remaining variables. This is an indicator of potential confounder effects \
+in the dataset."""))
         return confounded_pairs
 
     def _collider_detection(self, corr_mat: DataFrame, par_corr_mat: DataFrame,
@@ -147,8 +157,8 @@ def _collider_detection(self, corr_mat: DataFrame, par_corr_mat: DataFrame,
                     test=QualityWarning.Test.COLLIDER_CORRELATIONS, category=QualityWarning.category.DATA_RELATIONS,
                     priority=Priority.P2, data=colliding_pairs,
                     description=f"Found {len(colliding_pairs)} independently uncorrelated variable pairs that showed \
-correlation after controling for the remaining variables. \
-This is an indicator of potential colliding bias with other covariates."))
+correlation after controling for the remaining variables. This is an indicator of potential colliding bias with other \
+covariates."))
         return colliding_pairs
 
     @staticmethod
@@ -159,7 +169,8 @@ def _feature_importance(corr_mat: DataFrame, par_corr_mat: DataFrame,
 
         This method returns a summary of all detected important features.
         The summary contains zero, full order partial correlation and a note regarding potential confounding."""
-        assert label in corr_mat.columns, f"The provided label {label} does not exist as a column in the DataFrame."
+        assert label in corr_mat.columns, f"The correlations of the label '{label}', required for the feature \
+importance test, were not computed (this column has less than the minimum of 2 unique values needed)."
         label_corrs = corr_mat.loc[label].drop(label)
         mask = ones(label_corrs.shape, dtype='bool')
         mask[label_corrs.abs() <= corr_th] = False  # Drop pairs with zero order correlation below threshold
@@ -204,7 +215,7 @@ def _high_collinearity_detection(self, df: DataFrame, dtypes: dict, label: str =
                     category=QualityWarning.Category.DATA_RELATIONS, priority=Priority.P2, data=inflated,
                     description=f"""Found {len(inflated)} numerical variables with high Variance Inflation Factor \
 (VIF>{vif_th:.1f}). The variables listed in results are highly collinear with other variables in the dataset. \
-These will make model explainability harder and potentially give way to issues like overfitting.\
+These will make model explainability harder and potentially give way to issues like overfitting. \
 Depending on your end goal you might want to remove the highest VIF variables."""))
         if len(cat_coll_scores) > 0:
             # TODO: Merge warning messages (make one warning for the whole test,
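As a quick illustration of the new guard in evaluate (a sketch with a hypothetical frame, not the project's test data): DataFrame.nunique() counts distinct non-null values per column, so constant columns and all-NaN columns both satisfy nunique() < 2 and are removed before normalization, correlations, or VIF are attempted.

# Sketch of the nunique() < 2 guard (hypothetical frame).
from numpy import nan
from pandas import DataFrame

df = DataFrame({
    'x': [1, 2, 3],
    'const': [7, 7, 7],        # constant column -> nunique() == 1
    'empty': [nan, nan, nan],  # all-NaN column  -> nunique() == 0
})

nan_or_const = df.nunique() < 2
dropped = list(nan_or_const.index[nan_or_const])
df = df.drop(columns=dropped)

print(dropped)          # ['const', 'empty']
print(df.shape[1] < 2)  # True -> evaluate() would skip the engine entirely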

src/ydata_quality/utils/auxiliary.py

Lines changed: 2 additions & 1 deletion
@@ -92,9 +92,10 @@ def find_duplicate_columns(df: DataFrame, is_close=False) -> dict:
     return dups
 
 
-def drop_column_list(df: DataFrame, column_list: dict):
+def drop_column_list(df: DataFrame, column_list: dict, label: str = None):
     "Drops from a DataFrame a duplicates mapping of columns to duplicate lists. Works inplace."
     for col, dup_list in column_list.items():
+        dup_list = [col for col in dup_list if col != label]
         if col in df.columns:  # Ensures we will not drop both members of duplicate pairs
             df.drop(columns=dup_list, index=dup_list, inplace=True)
 
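The new label argument keeps the label column out of every duplicate list, so deduplicating the correlation matrix can never remove the row the feature importance test later asserts on. A sketch with a hypothetical duplicates mapping:

# Sketch of the label guard (hypothetical mapping from find_duplicate_columns).
dup_map = {'education': ['education-num', 'income']}
label = 'income'

for col, dup_list in dup_map.items():
    dup_list = [dup for dup in dup_list if dup != label]  # the label always survives
    print(col, '->', dup_list)  # education -> ['education-num']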

src/ydata_quality/utils/correlations.py

Lines changed: 20 additions & 12 deletions
@@ -6,8 +6,7 @@
 from itertools import combinations
 from typing import List, Optional
 
-from pandas import DataFrame, Series, crosstab
-from numpy.linalg import pinv
+from matplotlib.pyplot import figure as pltfigure, show as pltshow
 from numpy import (
     nan,
     fill_diagonal,
@@ -30,11 +29,13 @@
     isnan,
     triu_indices_from,
 )
-from scipy.stats import pearsonr, chi2_contingency
+from numpy.linalg import pinv
+from pandas import DataFrame, Series, crosstab
+from scipy.stats import chi2_contingency, pearsonr
 from scipy.stats.distributions import chi2
-from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
-from seaborn import heatmap, diverging_palette
-from matplotlib.pyplot import show as pltshow, figure as pltfigure
+from seaborn import diverging_palette, heatmap
+from statsmodels.stats.outliers_influence import \
+    variance_inflation_factor as vif
 
 from .auxiliary import drop_column_list, find_duplicate_columns
 
@@ -91,7 +92,8 @@ def unbiased_cramers_v(col1: ndarray, col2: ndarray) -> float:
     phi_sq_hat = npmax([0, phi_sq - ((r_vals - 1) * (k_vals - 1)) / (n_elements - 1)])
     k_hat = k_vals - square(k_vals - 1) / (n_elements - 1)
     r_hat = r_vals - square(r_vals - 1) / (n_elements - 1)
-    return sqrt(phi_sq_hat / npmin([k_hat - 1, r_hat - 1]))  # Note: this is strictly positive
+    den = npmin([k_hat - 1, r_hat - 1])
+    return sqrt(phi_sq_hat / den) if den != 0 else nan  # Note: this is strictly positive
 
 
 def correlation_ratio(col1: ndarray, col2: ndarray) -> float:
@@ -102,6 +104,8 @@ def correlation_ratio(col1: ndarray, col2: ndarray) -> float:
         col1 (ndarray): A categorical column with no null values
         col2 (ndarray): A numerical column with no null values"""
     uniques = unique(col1)
+    if len(uniques) < 2:
+        return nan
     y_x_hat = zeros(len(uniques))
     counts = zeros(len(uniques))
     for count, value in enumerate(uniques):
@@ -116,7 +120,7 @@ def correlation_ratio(col1: ndarray, col2: ndarray) -> float:
 
 
 # pylint: disable=too-many-locals
-def correlation_matrix(df: DataFrame, dtypes: dict, drop_dups: bool = False) -> DataFrame:
+def correlation_matrix(df: DataFrame, dtypes: dict, label: str, drop_dups: bool = False) -> DataFrame:
     """Returns the correlation matrix.
     The methods used for computing correlations are mapped according to the column dtypes of each pair."""
     corr_funcs = {  # Map supported correlation functions
@@ -146,8 +150,8 @@ def correlation_matrix(df: DataFrame, dtypes: dict, drop_dups: bool = False) ->
     if drop_dups:
         # Find duplicate row lists in absolute correlation matrix
         dup_pairs = find_duplicate_columns(corr_mat.abs(), True)
-        drop_column_list(corr_mat, dup_pairs)
-        drop_column_list(p_vals, dup_pairs)
+        drop_column_list(corr_mat, dup_pairs, label)
+        drop_column_list(p_vals, dup_pairs, label)
     return corr_mat, p_vals
 
 
@@ -195,10 +199,14 @@ def vif_collinearity(data: DataFrame, dtypes: dict, label: str = None) -> Series
     if label and label in data.columns:
         data = data.drop(columns=label)
     num_columns = [col for col in data.columns if dtypes[col] == 'numerical']
+    data = data.dropna(subset=num_columns)
     warnings.filterwarnings("ignore", category=RuntimeWarning)
-    vifs = [vif(data[num_columns].values, i) for i in range(len(data[num_columns].columns))]
+    if data.empty:
+        vifs = {}
+    else:
+        vifs = {num_columns[i]: vif(data[num_columns].values, i) for i in range(len(data[num_columns].columns))}
     warnings.resetwarnings()
-    return Series(data=vifs, index=num_columns).sort_values(ascending=False)
+    return Series(data=vifs, dtype=float).sort_values(ascending=False)
 
 
 # pylint: disable=too-many-locals
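Two of the changes above are defensive guards: unbiased_cramers_v now returns nan instead of dividing by zero when a column is effectively single-valued, and vif_collinearity builds its result from a name-to-score dict so that a frame emptied by dropna still yields a valid (empty) Series. A sketch of the latter with hypothetical scores:

# Sketch of the dict-based Series construction (hypothetical VIF scores).
from pandas import Series

vifs = {}  # what vif_collinearity produces when dropna() empties the frame
print(Series(data=vifs, dtype=float))  # empty float Series, no length mismatch

vifs = {'age': 6.3, 'hours-per-week': 1.4}
print(Series(data=vifs, dtype=float).sort_values(ascending=False))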

tests/engines/test_data_relations.py

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
+"Tests for the DataRelations module."
+from pytest import fixture
+from pandas import read_csv
+import nbformat
+from nbconvert.preprocessors import ExecutePreprocessor
+
+from ydata_quality.data_relations.engine import DataRelationsDetector
+
+
+@fixture(name='data_relations')
+def fixture_data_relations():
+    return DataRelationsDetector()
+
+
+@fixture(name='example_dataset_transformed')
+def fixture_example_dataset_transformed():
+    dataset_path = 'datasets/transformed/census_10k.csv'
+    return read_csv(dataset_path)
+
+
+@fixture(name='ipynb_tutorial')
+def fixture_ipynb_tutorial():
+    path = "tutorials/data_relations.ipynb"
+    with open(path, encoding='utf8', errors='strict') as file:
+        ntb = nbformat.read(file, as_version=4)
+    return ntb
+
+
+@fixture(name='dr_results_no_pcorr')
+def fixture_dr_results_no_pcorr(data_relations, example_dataset_transformed):
+    results = data_relations.evaluate(df=example_dataset_transformed,
+                                      dtypes=None,
+                                      label='income',
+                                      plot=False)
+    return data_relations, results
+
+
+@fixture(name='dr_results_pc_corr')
+def fixture_dr_results_pc_corr(data_relations, example_dataset_transformed):
+    df = example_dataset_transformed.drop(columns=['education-num'])
+    results = data_relations.evaluate(df=df,
+                                      dtypes=None,
+                                      label='income',
+                                      plot=False)
+    return data_relations, results
+
+
+def test_get_warnings(dr_results_no_pcorr):
+    new_drd = DataRelationsDetector()
+    assert isinstance(new_drd.get_warnings(), list)
+    assert len(new_drd.get_warnings()) == 0
+
+    ran_data_relations, _ = dr_results_no_pcorr
+    assert isinstance(ran_data_relations.get_warnings(), list)
+    assert len(ran_data_relations.get_warnings()) > 0
+
+
+def test_results(dr_results_no_pcorr, dr_results_pc_corr):
+    _, results = dr_results_no_pcorr
+    assert isinstance(results, dict)
+    assert set(results.keys()) == set(['Correlations', 'Feature Importance', 'High Collinearity'])
+
+    _, results2 = dr_results_pc_corr
+    assert isinstance(results2, dict)
+    assert set(results2.keys()) == set(['Correlations', 'Confounders', 'Colliders',
+                                        'Feature Importance', 'High Collinearity'])
+
+
+def test_tutorial_notebook_execution(ipynb_tutorial):
+    exp = ExecutePreprocessor(timeout=600, kernel_name='python3')
+    assert exp.preprocess(ipynb_tutorial, {'metadata': {'path': "tutorials"}})
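Assuming the dev requirements above are installed and the working directory is the repository root (the fixtures use paths relative to it), the new suite can be run on its own; a minimal sketch:

# Minimal runner for just this suite (sketch; execute from the repository root).
import pytest

raise SystemExit(pytest.main(["-q", "tests/engines/test_data_relations.py"]))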

0 commit comments
