Update Missing Values Handling (#378)

* variable code fixes * bc auroc sampling error calculation * refactor auroc sampling error nan handling * auroc realized perf nan handling * realized ap missing value handling * upd bc metrics missing value handling * upd realized perf mc missing value handling * upd realized perf regr missing value handling and fixes * upd dle missing value handling * wip update CBPE missing value handling * update CBPE BC missing value handling * update MC CBPE missing value handling * remove redundant methods/classes * linting updates * linting for DLE * performance calculation linting updates * remove unneeded import * mypy fixes * more mypy fixes * mypy updates * cbpe lingint * sampling error update * ap fix * code fixes wip * nan code updates * Removed some superfluous comments * Remove exception re-raise as it causes the "fallback scenario" to be ignored. * mypy and linting --------- Co-authored-by: Niels Nuyttens <[email protected]>
NannyML · May 7, 2024 · a213cd3 · a213cd3
1 parent 9f09409
commit a213cd3
Show file tree

Hide file tree

Showing 16 changed files with 1,889 additions and 850 deletions.
diff --git a/nannyml/base.py b/nannyml/base.py
@@ -614,3 +614,33 @@ def _raise_exception_for_negative_values(column: pd.Series):
             "\tLog-based metrics are not supported for negative target values.\n"
             f"\tCheck '{column.name}' at rows {str(negative_item_indices)}."
         )
+
+
+def common_nan_removal(data: pd.DataFrame, selected_columns: List[str]) -> Tuple[pd.DataFrame, bool]:
+    """Remove rows of dataframe containing NaN values on selected columns.
+
+    Parameters
+    ----------
+    data: pd.DataFrame
+        Pandas dataframe containing data.
+    selected_columns: List[str]
+        Columns checked for NaN values. InvalidArgumentsException is raised if any is missing from ``data``.
+
+    Returns
+    -------
+    df:
+        New dataframe without the rows that have a NaN in any of selected_columns. All columns of the
+        original dataframe are kept and the index is reset.
+    empty:
+        True when no rows are left after the removal, False otherwise.
+    """
+    # Every selected column has to exist in the data, otherwise the arguments are invalid.
+    if not set(selected_columns) <= set(data.columns):
+        raise InvalidArgumentsException(
+            f"Selected columns: {selected_columns} not all present in provided data columns {list(data.columns)}"
+        )
+    df = data.dropna(axis=0, how='any', inplace=False, subset=selected_columns).reset_index(drop=True).infer_objects()
+    empty: bool = False
+    if df.shape[0] == 0:
+        empty = True
+    return (df, empty)
diff --git a/nannyml/drift/multivariate/data_reconstruction/calculator.py b/nannyml/drift/multivariate/data_reconstruction/calculator.py
@@ -14,7 +14,7 @@
 
 """
 
-from typing import List, Optional, Tuple, Union, Dict
+from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import pandas as pd

diff --git a/nannyml/performance_calculation/calculator.py b/nannyml/performance_calculation/calculator.py
@@ -111,11 +111,9 @@ def __init__(
             When it is not given, only the ROC AUC and Average Precision metrics are supported.
         problem_type: Union[str, ProblemType]
             Determines which method to use. Allowed values are:
-
                 - 'regression'
                 - 'classification_binary'
                 - 'classification_multiclass'
-
         y_pred_proba: ModelOutputsType, default=None
             Name(s) of the column(s) containing your model output.
             Pass a single string when there is only a single model output column, e.g. in binary classification cases.
@@ -124,7 +122,6 @@ def __init__(
         timestamp_column_name: str, default=None
             The name of the column containing the timestamp of the model prediction.
         thresholds: dict
-
             The default values are::
 
                 {
@@ -158,7 +155,7 @@ def __init__(
         chunk_period: str, default=None
             Splits the data according to the given period.
             Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given.
-        chunker : Chunker, default=None
+        chunker: Chunker, default=None
             The `Chunker` used to split the data sets into a lists of chunks.
         normalize_confusion_matrix: str, default=None
             Determines how the confusion matrix will be normalized. Allowed values are None, 'all', 'true' and
@@ -311,7 +308,7 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
         data = data.copy(deep=True)
 
         # Setup for target completeness rate
-        data['NML_TARGET_INCOMPLETE'] = data[self.y_true].isna().astype(np.int16)
+        data[TARGET_COMPLETENESS_RATE_COLUMN_NAME] = data[self.y_true].isna().astype(np.int16)
 
         # Generate chunks
         if self.chunker is None:

diff --git a/nannyml/performance_calculation/metrics/base.py b/nannyml/performance_calculation/metrics/base.py
@@ -1,6 +1,7 @@
 #  Author:   Niels Nuyttens  <[email protected]>
 #
 #  License: Apache Software License 2.0
+"""Base classes for performance calculation."""
 import abc
 import logging
 from logging import Logger
@@ -134,7 +135,6 @@ def sampling_error(self, data: pd.DataFrame):
 
         Returns
         -------
-
         sampling_error: float
             The expected sampling error.
 
@@ -153,6 +153,7 @@ def alert(self, value: float) -> bool:
         ----------
         value: float
             Value of a calculated metric.
+
         Returns
         -------
         bool: bool
@@ -206,18 +207,22 @@ def get_chunk_record(self, chunk_data: pd.DataFrame) -> Dict:
 
     @property
     def display_name(self) -> str:
+        """Get the metric display name, which is the metric's ``name``."""
         return self.name
 
     @property
     def column_name(self) -> str:
+        """Get the column name of the first metric component."""
         return self.components[0][1]
 
     @property
     def display_names(self) -> List[str]:
+        """Get the display names of all metric components."""
         return [c[0] for c in self.components]
 
     @property
     def column_names(self) -> List[str]:
+        """Get the column names of all metric components."""
         return [c[1] for c in self.components]
 
 
@@ -256,6 +261,8 @@ def create(cls, key: str, use_case: ProblemType, **kwargs) -> Metric:
 
     @classmethod
     def register(cls, metric: str, use_case: ProblemType) -> Callable:
+        """Register performance metric class in MetricFactory."""
+
         def inner_wrapper(wrapped_class: Type[Metric]) -> Type[Metric]:
             if metric in cls.registry:
                 if use_case in cls.registry[metric]: