3
3
"""
4
4
from typing import List , Optional , Tuple
5
5
6
+ from numpy import argwhere , ones , tril
6
7
from pandas import DataFrame
7
- from numpy import ones , tril , argwhere
8
-
9
8
from src .ydata_quality .core .warnings import Priority
10
9
11
10
from ..core import QualityEngine , QualityWarning
@@ -40,7 +39,7 @@ def dtypes(self):
40
39
def dtypes (self , df_dtypes : Tuple [DataFrame , dict ]):
41
40
df , dtypes = df_dtypes
42
41
if not isinstance (dtypes , dict ):
43
- self ._logger .warning ("Property 'dtypes' should be a dictionary. Defaulting to all column dtypes inference." )
42
+ self ._logger .debug ("Property 'dtypes' should be a dictionary. Defaulting to all column dtypes inference." )
44
43
dtypes = {}
45
44
cols_not_in_df = [col for col in dtypes if col not in df .columns ]
46
45
if len (cols_not_in_df ) > 0 :
@@ -49,7 +48,7 @@ def dtypes(self, df_dtypes: Tuple[DataFrame, dict]):
49
48
wrong_dtypes = [col for col , dtype in dtypes .items () if dtype not in supported_dtypes ]
50
49
if len (wrong_dtypes ) > 0 :
51
50
self ._logger .warning (
52
- "Columns %s of dtypes where not defined with a supported dtype and will be inferred." , wrong_dtypes )
51
+ "Columns %s have no valid dtypes. Supported dtypes will be inferred." , wrong_dtypes )
53
52
dtypes = {key : val for key , val in dtypes .items () if key not in cols_not_in_df + wrong_dtypes }
54
53
df_col_set = set (df .columns )
55
54
dtypes_col_set = set (dtypes .keys ())
@@ -64,7 +63,7 @@ def dtypes(self, df_dtypes: Tuple[DataFrame, dict]):
64
63
def evaluate (self , df : DataFrame , dtypes : Optional [dict ] = None , label : str = None , corr_th : float = 0.8 ,
65
64
vif_th : float = 5 , p_th : float = 0.05 , plot : bool = True , summary : bool = True ) -> dict :
66
65
"""Runs the data relations tests on the dataset and reports the warnings found.
67
- We perform standard normalization of numerical features in order to unbias VIF and partial correlation methods .
66
+ Standard normalization of numerical features is performed as a preprocessing operation .
68
67
This bias correction produces results equivalent to adding a constant feature to the dataset.
69
68
70
69
Args:
@@ -74,17 +73,25 @@ def evaluate(self, df: DataFrame, dtypes: Optional[dict] = None, label: str = No
74
73
label (Optional[str]): A string identifying the label feature column
75
74
corr_th (float): Absolute threshold for high correlation detection. Defaults to 0.8.
76
75
vif_th (float): Variance Inflation Factor threshold for numerical independence test.
77
- Typically 5-10 is recommended. Defaults to 5.
78
- p_th (float): Fraction of the right tail of the chi squared CDF.
79
- Defines threshold for categorical independence test. Defaults to 0.05.
76
+ Typically a minimum of 5-10 is recommended. Defaults to 5.
77
+ p_th (float): Fraction of the right tail of the chi squared CDF defining threshold for categorical
78
+ independence test. Defaults to 0.05.
80
79
plot (bool): Pass True to produce all available graphical outputs, False to suppress all graphical output.
81
80
summary (bool): Print a report containing all the warnings detected during the data quality analysis.
82
81
"""
83
- assert label in df .columns or not label , "The provided label name does not exist as a column in the dataset"
82
+ results = {}
83
+ nan_or_const = df .nunique () < 2 # Constant columns or all nan columns
84
+ label = None if label in nan_or_const else label
85
+ self ._logger .warning ('The columns %s are constant or all NaNs and \
86
+ were dropped from this evaluation.' , list (nan_or_const .index [nan_or_const ]))
87
+ df = df .drop (columns = nan_or_const .index [nan_or_const ]) # Constant columns or all nan columns are dropped
88
+ if df .shape [1 ] < 2 :
89
+ self ._logger .warning ('There are fewer than 2 columns on the dataset where correlations can be computed. \
90
+ Skipping the DataRelations engine execution.' )
91
+ return results
84
92
self .dtypes = (df , dtypes ) # Consider refactoring QualityEngine dtypes (df as argument of setter)
85
93
df = standard_normalize (df , self .dtypes )
86
- results = {}
87
- corr_mat , _ = correlation_matrix (df , self .dtypes , True )
94
+ corr_mat , _ = correlation_matrix (df , self .dtypes , label , True )
88
95
p_corr_mat = partial_correlation_matrix (corr_mat )
89
96
results ['Correlations' ] = {'Correlation matrix' : corr_mat , 'Partial correlation matrix' : p_corr_mat }
90
97
if plot :
@@ -96,9 +103,12 @@ def evaluate(self, df: DataFrame, dtypes: Optional[dict] = None, label: str = No
96
103
results ['Colliders' ] = self ._collider_detection (corr_mat , p_corr_mat , corr_th )
97
104
else :
98
105
self ._logger .warning ('The partial correlation matrix is not computable for this dataset. \
99
- Skipping potential confounder and collider detection tests.' )
106
+ Skipped potential confounder and collider detection tests.' )
100
107
if label :
101
- results ['Feature Importance' ] = self ._feature_importance (corr_mat , p_corr_mat , label , corr_th )
108
+ try :
109
+ results ['Feature Importance' ] = self ._feature_importance (corr_mat , p_corr_mat , label , corr_th )
110
+ except AssertionError as exception :
111
+ self ._logger .warning (str (exception ))
102
112
results ['High Collinearity' ] = self ._high_collinearity_detection (df , self .dtypes , label , vif_th , p_th = p_th )
103
113
self ._clean_warnings ()
104
114
if summary :
@@ -123,9 +133,9 @@ def _confounder_detection(self, corr_mat: DataFrame, par_corr_mat: DataFrame,
123
133
QualityWarning (
124
134
test = QualityWarning .Test .CONFOUNDED_CORRELATIONS , category = QualityWarning .Category .DATA_RELATIONS ,
125
135
priority = Priority .P2 , data = confounded_pairs ,
126
- description = f"""
127
- Found { len ( confounded_pairs ) } independently correlated variable pairs that disappeared after controling \
128
- for the remaining variables. This is an indicator of potential confounder effects in the dataset.""" ))
136
+ description = f"""Found { len ( confounded_pairs ) } independently correlated variable pairs that \
137
+ disappeared after controling for the remaining variables. This is an indicator of potential confounder effects \
138
+ in the dataset.""" ))
129
139
return confounded_pairs
130
140
131
141
def _collider_detection (self , corr_mat : DataFrame , par_corr_mat : DataFrame ,
@@ -147,8 +157,8 @@ def _collider_detection(self, corr_mat: DataFrame, par_corr_mat: DataFrame,
147
157
test = QualityWarning .Test .COLLIDER_CORRELATIONS , category = QualityWarning .category .DATA_RELATIONS ,
148
158
priority = Priority .P2 , data = colliding_pairs ,
149
159
description = f"Found { len (colliding_pairs )} independently uncorrelated variable pairs that showed \
150
- correlation after controling for the remaining variables. \
151
- This is an indicator of potential colliding bias with other covariates." ))
160
+ correlation after controling for the remaining variables. This is an indicator of potential colliding bias with other \
161
+ covariates." ))
152
162
return colliding_pairs
153
163
154
164
@staticmethod
@@ -159,7 +169,8 @@ def _feature_importance(corr_mat: DataFrame, par_corr_mat: DataFrame,
159
169
160
170
This method returns a summary of all detected important features.
161
171
The summary contains the zero-order correlation, the full-order partial correlation and a note regarding potential confounding."""
162
- assert label in corr_mat .columns , f"The provided label { label } does not exist as a column in the DataFrame."
172
+ assert label in corr_mat .columns , f"The correlations of the label '{ label } ', required for the feature \
173
+ importance test, were not computed (this column has less than the minimum of 2 unique values needed)."
163
174
label_corrs = corr_mat .loc [label ].drop (label )
164
175
mask = ones (label_corrs .shape , dtype = 'bool' )
165
176
mask [label_corrs .abs () <= corr_th ] = False # Drop pairs with zero order correlation below threshold
@@ -204,7 +215,7 @@ def _high_collinearity_detection(self, df: DataFrame, dtypes: dict, label: str =
204
215
category = QualityWarning .Category .DATA_RELATIONS , priority = Priority .P2 , data = inflated ,
205
216
description = f"""Found { len (inflated )} numerical variables with high Variance Inflation Factor \
206
217
(VIF>{ vif_th :.1f} ). The variables listed in results are highly collinear with other variables in the dataset. \
207
- These will make model explainability harder and potentially give way to issues like overfitting.\
218
+ These will make model explainability harder and potentially give way to issues like overfitting. \
208
219
Depending on your end goal you might want to remove the highest VIF variables.""" ))
209
220
if len (cat_coll_scores ) > 0 :
210
221
# TODO: Merge warning messages (make one warning for the whole test,
0 commit comments