Skip to content

Commit

Permalink
Fix _list_missing issues + add "run with prediction" tests
Browse files Browse the repository at this point in the history
  • Loading branch information
nnansters committed Apr 25, 2024
1 parent cb6e6b3 commit 4eb8cb8
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 2 deletions.
10 changes: 8 additions & 2 deletions nannyml/performance_estimation/confidence_based/cbpe.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,10 @@ def _estimate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.')

if self.problem_type == ProblemType.CLASSIFICATION_BINARY:
_list_missing([self.y_pred, self.y_pred_proba], data)
required_cols = [self.y_pred_proba]
if self.y_pred is not None:
required_cols.append(self.y_pred)
_list_missing(required_cols, list(data.columns))

# We need uncalibrated data to calculate the realized performance on.
# https://github.com/NannyML/nannyml/issues/98
Expand Down Expand Up @@ -419,7 +422,10 @@ def _fit_binary(self, reference_data: pd.DataFrame) -> CBPE:
if reference_data.empty:
raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.')

_list_missing([self.y_true, self.y_pred_proba, self.y_pred], list(reference_data.columns))
required_cols = [self.y_true, self.y_pred_proba]
if self.y_pred is not None:
required_cols.append(self.y_pred)
_list_missing(required_cols, list(reference_data.columns))

# We need uncalibrated data to calculate the realized performance on.
# We need realized performance in threshold calculations.
Expand Down
14 changes: 14 additions & 0 deletions tests/performance_calculation/test_performance_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,3 +462,17 @@ def test_binary_classification_result_plots_raise_no_exceptions(calc_args, plot_
_ = sut.plot(**plot_args)
except Exception as exc:
pytest.fail(f"an unexpected exception occurred: {exc}")


def test_binary_classification_calculate_without_prediction_column():
    """Regression test: the performance calculator must fit and calculate when
    no ``y_pred`` column is configured, using only predicted probabilities.

    Guards the ``_list_missing`` fix that makes the prediction column optional
    for probability-only metrics such as ROC AUC and average precision.
    """
    reference, analysis, analysis_targets = load_synthetic_binary_classification_dataset()
    calc = PerformanceCalculator(
        y_true='work_home_actual',
        y_pred_proba='y_pred_proba',
        problem_type=ProblemType.CLASSIFICATION_BINARY,
        # Only metrics that operate on predicted probabilities, so y_pred can be omitted.
        metrics=['roc_auc', 'average_precision'],
        timestamp_column_name='timestamp',
        chunk_period='M',
    ).fit(reference)
    # Analysis targets are shipped separately; join them back on the record id.
    result = calc.calculate(analysis.merge(analysis_targets, on='id'))
    # The original test dropped the result on the floor; assert we actually
    # produced one so a silent None/failure cannot pass.
    assert result is not None

16 changes: 16 additions & 0 deletions tests/performance_estimation/CBPE/test_cbpe.py
Original file line number Diff line number Diff line change
Expand Up @@ -702,3 +702,19 @@ def test_cbpe_with_default_thresholds():
sut = est.thresholds

assert sut == DEFAULT_THRESHOLDS


def test_cbpe_without_predictions():
    """Regression test: CBPE must fit and estimate when no ``y_pred`` column is
    configured, using only predicted probabilities.

    Guards the ``_list_missing`` fix in ``CBPE._fit_binary`` / ``CBPE._estimate``
    that makes the prediction column optional for probability-only metrics.
    """
    ref_df, ana_df, _ = load_synthetic_binary_classification_dataset()
    cbpe = CBPE(
        y_pred_proba='y_pred_proba',
        y_true='work_home_actual',
        problem_type='classification_binary',
        # Only metrics that operate on predicted probabilities, so y_pred can be omitted.
        metrics=[
            'roc_auc',
            'average_precision',
        ],
        timestamp_column_name='timestamp',
        chunk_period='M',
    ).fit(ref_df)
    result = cbpe.estimate(ana_df)
    # The original test never inspected the estimate; assert one was produced
    # so a silent None/failure cannot pass.
    assert result is not None

0 comments on commit 4eb8cb8

Please sign in to comment.