NannyML · nnansters · May 7, 2024 · Apr 12, 2024 · Apr 12, 2024 · Apr 16, 2024
@@ -614,3 +614,34 @@
             "\tLog-based metrics are not supported for negative target values.\n"
             f"\tCheck '{column.name}' at rows {str(negative_item_indices)}."
         )
+
+def common_nan_removal(data: pd.DataFrame, selected_columns: List[str]) -> Tuple[pd.DataFrame, bool]:
+    """Remove rows of dataframe containing NaN values on selected columns.
+
+    Parameters
+    ----------
+    data: pd.DataFrame
+        Pandas dataframe containing data.
+    selected_columns: List[str]
+        Names of the columns that are checked for NaN values.
+
+    Returns
+    -------
+    df:
+        Dataframe with rows containing NaN values in selected_columns removed and the index reset.
+        All columns of the original dataframe are returned.
+    empty:
+        Boolean indicating whether the resulting dataframe has no rows left (True) or still contains rows (False).
+    """
+    # Fail early if any selected column (e.g. the target) is missing from the provided data.
+    if not set(selected_columns) <= set(data.columns):
+        raise InvalidArgumentsException(
+            f"Selected columns: {selected_columns} not all present in provided data columns {list(data.columns)}"
+        )
+    df = data.dropna(
+        axis=0, how='any', inplace=False, subset=selected_columns
+    ).reset_index(drop=True).infer_objects()
+    empty: bool = False
+    if df.shape[0] == 0:
+        empty = True
+    return (df, empty)
@@ -109,11 +109,9 @@ def __init__(
             The name of the column containing your model predictions.
         problem_type: Union[str, ProblemType]
             Determines which method to use. Allowed values are:
-
                 - 'regression'
                 - 'classification_binary'
                 - 'classification_multiclass'
-
         y_pred_proba: ModelOutputsType, default=None
             Name(s) of the column(s) containing your model output.
             Pass a single string when there is only a single model output column, e.g. in binary classification cases.
@@ -122,7 +120,6 @@ def __init__(
         timestamp_column_name: str, default=None
             The name of the column containing the timestamp of the model prediction.
         thresholds: dict
-
             The default values are::
 
                 {
@@ -156,7 +153,7 @@ def __init__(
         chunk_period: str, default=None
             Splits the data according to the given period.
             Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given.
-        chunker : Chunker, default=None
+        chunker: Chunker, default=None
             The `Chunker` used to split the data sets into a list of chunks.
         normalize_confusion_matrix: str, default=None
             Determines how the confusion matrix will be normalized. Allowed values are None, 'all', 'true' and
@@ -302,7 +299,7 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
         data = data.copy(deep=True)
 
         # Setup for target completeness rate
-        data['NML_TARGET_INCOMPLETE'] = data[self.y_true].isna().astype(np.int16)
+        data[TARGET_COMPLETENESS_RATE_COLUMN_NAME] = data[self.y_true].isna().astype(np.int16)
 
         # Generate chunks
         if self.chunker is None:

@@ -1,6 +1,7 @@
 #  Author:   Niels Nuyttens  <[email protected]>
 #
 #  License: Apache Software License 2.0
+"""Base classes for performance calculation."""
 import abc
 import logging
 from logging import Logger
@@ -134,7 +135,6 @@ def sampling_error(self, data: pd.DataFrame):
 
         Returns
         -------
-
         sampling_error: float
             The expected sampling error.
 
@@ -153,6 +153,7 @@ def alert(self, value: float) -> bool:
         ----------
         value: float
             Value of a calculated metric.
+
         Returns
         -------
         bool: bool
@@ -206,18 +207,22 @@ def get_chunk_record(self, chunk_data: pd.DataFrame) -> Dict:
 
     @property
     def display_name(self) -> str:
+        """Get metric display name."""
         return self.name
 
     @property
     def column_name(self) -> str:
+        """Get metric column name."""
         return self.components[0][1]
 
     @property
     def display_names(self) -> List[str]:
+        """Get metric display names."""
         return [c[0] for c in self.components]
 
     @property
     def column_names(self) -> List[str]:
+        """Get metric column names."""
         return [c[1] for c in self.components]
 
 
@@ -256,6 +261,7 @@ def create(cls, key: str, use_case: ProblemType, **kwargs) -> Metric:
 
     @classmethod
     def register(cls, metric: str, use_case: ProblemType) -> Callable:
+        """Register performance metric class in MetricFactory."""
         def inner_wrapper(wrapped_class: Type[Metric]) -> Type[Metric]:
             if metric in cls.registry:
                 if use_case in cls.registry[metric]: