bc auroc sampling error calculation
nikml committed Apr 12, 2024
1 parent ab7e79d commit dd12310
Showing 2 changed files with 41 additions and 4 deletions.
32 changes: 32 additions & 0 deletions nannyml/base.py
@@ -614,3 +614,35 @@ def _raise_exception_for_negative_values(column: pd.Series):
"\tLog-based metrics are not supported for negative target values.\n"
f"\tCheck '{column.name}' at rows {str(negative_item_indices)}."
)

def common_nan_removal(data: pd.DataFrame, selected_columns: List[str]) -> Tuple[List[pd.Series], bool]:

@michael-nml (Collaborator) commented on Apr 12, 2024:

Functionality seems to largely overlap with _remove_nans. Maybe we could just use that function?
As far as I can tell, the extra functionality this one provides (checking for column presence and for an empty result) isn't even used.

@nnansters (Contributor) commented on Apr 12, 2024:

This is indeed the n-th time we've added a "remove NaN" function; I'm in favor of reusing the existing ones. Also, this one is not exactly "common", since it is only used once :-)

@nikml (Author, Contributor) commented on Apr 19, 2024:

Just saw the comment. I am removing all instances of _remove_nans from the code; this function should supersede it. I have also updated it after our discussion with Niels so that it can fill that role.

@michael-nml (Collaborator) commented on Apr 19, 2024:

Two notes to make sure you're aware (see the sketch after this list):

  • This common_nan_removal function may behave differently from _remove_nans depending on the arguments. I specifically added the ability for _remove_nans to drop a row only if an entire combination of columns is NaN, e.g. for multiclass problems, only drop the row if all probability columns are NaN. I don't think the new common_nan_removal function covers that case as it currently stands.
  • I think this function currently doesn't call infer_objects after NaNs have been dropped. That is important to ensure the dtypes are set correctly, so you may want to add it to the new function.
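
A minimal pandas sketch of the two behaviours described above, using made-up multiclass probability columns (illustrative only, not the actual _remove_nans implementation):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'y_pred_proba_a': [0.1, np.nan, np.nan],
    'y_pred_proba_b': [0.9, np.nan, 0.4],
})
prob_cols = ['y_pred_proba_a', 'y_pred_proba_b']

# Behaviour of common_nan_removal as committed: drop a row if ANY selected column is NaN.
drop_any = df.dropna(axis=0, how='any', subset=prob_cols)  # keeps only row 0

# The _remove_nans behaviour described above: drop a row only if ALL selected
# columns are NaN (e.g. every multiclass probability column is missing).
drop_all = df.dropna(axis=0, how='all', subset=prob_cols)  # keeps rows 0 and 2

# Re-inferring dtypes after the drop keeps column dtypes sensible, which is the
# point of the infer_objects note above.
drop_all = drop_all.reset_index(drop=True).infer_objects()
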
"""Remove NaN values from rows of selected columns.
Parameters
----------
data: pd.DataFrame
Pandas dataframe containing data.
selected_columns: List[str]
List containing the strings of column names
Returns
-------
col_list:
List containing the clean columns specified. Order of columns from selected_columns is
preserved.
empty:
Boolean whether the resulting data are contain any rows (false) or not (true)
"""
    # All selected columns must be present in the provided data.
    if not set(selected_columns) <= set(data.columns):
        raise InvalidArgumentsException(
            f"Selected columns: {selected_columns} not all present in provided data columns {list(data.columns)}"
        )
    df = data[selected_columns].dropna(axis=0, how='any', inplace=False).reset_index(drop=True)
    # The result is empty when every row had a NaN in at least one of the selected columns.
    empty: bool = df.shape[0] == 0
    results = [df[col] for col in selected_columns]
    return results, empty
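
For illustration, a quick usage sketch of the new helper (the values and column names are made up):

import numpy as np
import pandas as pd

from nannyml.base import common_nan_removal

data = pd.DataFrame({
    'y_true': [0, 1, np.nan, 1],
    'y_pred_proba': [0.2, np.nan, 0.7, 0.9],
})

# The two rows that have a NaN in either selected column are dropped.
(y_true, y_pred_proba), empty = common_nan_removal(data, ['y_true', 'y_pred_proba'])
assert not empty
assert len(y_true) == len(y_pred_proba) == 2
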
13 changes: 9 additions & 4 deletions nannyml/sampling_error/binary_classification.py
@@ -22,6 +22,7 @@
from sklearn.metrics import average_precision_score

from nannyml.exceptions import InvalidArgumentsException
from nannyml.base import common_nan_removal

# How many experiments to perform when doing resampling to approximate sampling error.
N_EXPERIMENTS = 50
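
For context, a generic sketch of what "resampling to approximate sampling error" means here; this is illustrative only, not NannyML's actual implementation, and the helper name is hypothetical:

import numpy as np
from sklearn.metrics import roc_auc_score

def _approximate_auroc_sampling_error(y_true, y_pred_proba, sample_size, n_experiments=N_EXPERIMENTS):
    """Draw n_experiments resamples of sample_size rows and take the std of AUROC across them."""
    rng = np.random.default_rng(42)
    y_true, y_pred_proba = np.asarray(y_true), np.asarray(y_pred_proba)
    scores = []
    for _ in range(n_experiments):
        idx = rng.integers(0, len(y_true), size=sample_size)
        if y_true[idx].min() == y_true[idx].max():
            continue  # AUROC is undefined when a resample contains a single class
        scores.append(roc_auc_score(y_true[idx], y_pred_proba[idx]))
    return np.std(scores)
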
@@ -49,14 +50,18 @@ def auroc_sampling_error_components(y_true_reference: pd.Series, y_pred_proba_re
    -------
    (std, fraction): Tuple[np.ndarray, float]
    """

    y_true = y_true_reference.copy().reset_index(drop=True)
    y_pred_proba = y_pred_proba_reference.copy().reset_index(drop=True)
    # Remove rows where either series contains a NaN, conforming to the common_nan_removal API.
    df = pd.DataFrame({
        'y_true': y_true_reference,
        'y_pred_proba': y_pred_proba_reference,
    })
    [y_true, y_pred_proba], empty = common_nan_removal(df, ['y_true', 'y_pred_proba'])
    y_true = y_true.to_numpy()
    y_pred_proba = y_pred_proba.to_numpy()

    if np.mean(y_true) > 0.5:
        y_true = abs(np.asarray(y_true) - 1)
        y_pred_proba = 1 - y_pred_proba

    sorted_idx = np.argsort(y_pred_proba)
    y_pred_proba = y_pred_proba[sorted_idx]
    y_true = y_true[sorted_idx]
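A rough usage sketch of the changed function, with made-up reference data, just to show that pairs containing a NaN are now filtered out before the components are computed:

import numpy as np
import pandas as pd

from nannyml.sampling_error.binary_classification import auroc_sampling_error_components

y_true_reference = pd.Series([0, 1, 1, 0, np.nan, 1, 0, 1])
y_pred_proba_reference = pd.Series([0.1, 0.8, np.nan, 0.3, 0.5, 0.9, 0.2, 0.7])

# Rows 2 and 4 each contain a NaN, so the components are computed on the six complete pairs.
components = auroc_sampling_error_components(y_true_reference, y_pred_proba_reference)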
