Skip to content

Commit

Permalink
Update Missing Values Handling (#378)
Browse files Browse the repository at this point in the history
* variable code fixes

* bc auroc sampling error calculation

* refactor auroc sampling error nan handling

* auroc realized perf nan handling

* realized ap missing value handling

* upd bc metrics missing value handling

* upd realized perf mc missing value handling

* upd realized perf regr missing value handling and fixes

* upd dle missing value handling

* wip update CBPE missing value handling

* update CBPE BC missing value handling

* update MC CBPE missing value handling

* remove redundant methods/classes

* linting updates

* linting for DLE

* performance calculation linting updates

* remove unneeded import

* mypy fixes

* more mypy fixes

* mypy updates

* cbpe lingint

* sampling error update

* ap fix

* code fixes wip

* nan code updates

* Removed some superfluous comments

* Remove exception re-raise as it causes the "fallback scenario" to be ignored.

* mypy and linting

---------

Co-authored-by: Niels Nuyttens <[email protected]>
  • Loading branch information
nikml and nnansters committed May 7, 2024
1 parent 9f09409 commit a213cd3
Show file tree
Hide file tree
Showing 16 changed files with 1,889 additions and 850 deletions.
30 changes: 30 additions & 0 deletions nannyml/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,3 +614,33 @@ def _raise_exception_for_negative_values(column: pd.Series):
"\tLog-based metrics are not supported for negative target values.\n"
f"\tCheck '{column.name}' at rows {str(negative_item_indices)}."
)


def common_nan_removal(data: pd.DataFrame, selected_columns: List[str]) -> Tuple[pd.DataFrame, bool]:
    """Remove rows of a dataframe that contain NaN values in the selected columns.

    Parameters
    ----------
    data: pd.DataFrame
        Pandas dataframe containing data.
    selected_columns: List[str]
        Names of the columns that are checked for missing values.

    Returns
    -------
    df: pd.DataFrame
        New dataframe without the rows that have a NaN in any of ``selected_columns``. All columns of the
        original dataframe are returned; the index is reset and object columns are re-inferred.
    empty: bool
        ``True`` when the resulting dataframe has no rows left, ``False`` otherwise.

    Raises
    ------
    InvalidArgumentsException
        When one or more of ``selected_columns`` is not a column of ``data``.
    """
    # Validate up front so callers get a descriptive error rather than the KeyError `dropna` would raise.
    if not set(selected_columns) <= set(data.columns):
        raise InvalidArgumentsException(
            f"Selected columns: {selected_columns} not all present in provided data columns {list(data.columns)}"
        )

    df = data.dropna(axis=0, how='any', subset=selected_columns).reset_index(drop=True).infer_objects()
    return df, len(df) == 0
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"""

from typing import List, Optional, Tuple, Union, Dict
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
Expand Down
7 changes: 2 additions & 5 deletions nannyml/performance_calculation/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,9 @@ def __init__(
When it is not given, only the ROC AUC and Average Precision metrics are supported.
problem_type: Union[str, ProblemType]
Determines which method to use. Allowed values are:
- 'regression'
- 'classification_binary'
- 'classification_multiclass'
y_pred_proba: ModelOutputsType, default=None
Name(s) of the column(s) containing your model output.
Pass a single string when there is only a single model output column, e.g. in binary classification cases.
Expand All @@ -124,7 +122,6 @@ def __init__(
timestamp_column_name: str, default=None
The name of the column containing the timestamp of the model prediction.
thresholds: dict
The default values are::
{
Expand Down Expand Up @@ -158,7 +155,7 @@ def __init__(
chunk_period: str, default=None
Splits the data according to the given period.
Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given.
chunker : Chunker, default=None
chunker: Chunker, default=None
The `Chunker` used to split the data sets into a list of chunks.
normalize_confusion_matrix: str, default=None
Determines how the confusion matrix will be normalized. Allowed values are None, 'all', 'true' and
Expand Down Expand Up @@ -311,7 +308,7 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
data = data.copy(deep=True)

# Setup for target completeness rate
data['NML_TARGET_INCOMPLETE'] = data[self.y_true].isna().astype(np.int16)
data[TARGET_COMPLETENESS_RATE_COLUMN_NAME] = data[self.y_true].isna().astype(np.int16)

# Generate chunks
if self.chunker is None:
Expand Down
9 changes: 8 additions & 1 deletion nannyml/performance_calculation/metrics/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Author: Niels Nuyttens <[email protected]>
#
# License: Apache Software License 2.0
"""Base Classes for performane calculation."""
import abc
import logging
from logging import Logger
Expand Down Expand Up @@ -134,7 +135,6 @@ def sampling_error(self, data: pd.DataFrame):
Returns
-------
sampling_error: float
The expected sampling error.
Expand All @@ -153,6 +153,7 @@ def alert(self, value: float) -> bool:
----------
value: float
Value of a calculated metric.
Returns
-------
bool: bool
Expand Down Expand Up @@ -206,18 +207,22 @@ def get_chunk_record(self, chunk_data: pd.DataFrame) -> Dict:

@property
def display_name(self) -> str:
"""Get the metric display name, which is the value of ``self.name``."""
return self.name

@property
def column_name(self) -> str:
"""Get the metric column name: the second element of the first entry in ``self.components``."""
return self.components[0][1]

@property
def display_names(self) -> List[str]:
"""Get the metric display names: the first element of every entry in ``self.components``."""
return [c[0] for c in self.components]

@property
def column_names(self) -> List[str]:
"""Get the metric column names: the second element of every entry in ``self.components``."""
return [c[1] for c in self.components]


Expand Down Expand Up @@ -256,6 +261,8 @@ def create(cls, key: str, use_case: ProblemType, **kwargs) -> Metric:

@classmethod
def register(cls, metric: str, use_case: ProblemType) -> Callable:
"""Register performance metric class in MetricFactory."""

def inner_wrapper(wrapped_class: Type[Metric]) -> Type[Metric]:
if metric in cls.registry:
if use_case in cls.registry[metric]:
Expand Down
Loading

0 comments on commit a213cd3

Please sign in to comment.