From ed297b42a5295577d266aa64d8f0813fa6e1bda2 Mon Sep 17 00:00:00 2001 From: Kelly Peterson Date: Thu, 8 Jul 2021 14:07:55 -0600 Subject: [PATCH] =?UTF-8?q?Augmenting=20time=20series=20which=20have=20add?= =?UTF-8?q?itional=20useful=20columns=20besides=20'=E2=80=A6=20(#35)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Augmenting time series which have additional useful columns besides 'alarm' such as 'upperbound' so that we can access these. Also added a few tests. All tests still pass. * Removing parameter for passing back more columns besides 'alarm' which are available and useful (e.g. 'upperbound'). Not yet complete since some tests are not passing. * Fixing some testing behavior so that when present, "upperbound" will not cause failures with test expectations. * Fixing documentation and typos in slot value extraction Fixing documentation and typos in slot value extraction Co-authored-by: RĂ¼diger Busche * Handling a merge conflict so that the current master change will still return np.ndarray even if the signature and behavior has changed to accomodate any slot name and not simply 'alarm'. * Fixed formatting via black. * Fixed import order via isort. Co-authored-by: RĂ¼diger Busche --- epysurv/models/timepoint/_base.py | 34 ++++++++++++++++++++------- epysurv/models/timeseries/_base.py | 16 ++++++++++--- tests/test_simulation_utils.py | 3 ++- tests/test_timepoint_models.py | 20 ++++++++++++++-- tests/test_timeseries_models.py | 37 +++++++++++++++++++++++++++++- tests/utils.py | 6 +++++ 6 files changed, 101 insertions(+), 15 deletions(-) diff --git a/epysurv/models/timepoint/_base.py b/epysurv/models/timepoint/_base.py index cd7efce..cdab815 100644 --- a/epysurv/models/timepoint/_base.py +++ b/epysurv/models/timepoint/_base.py @@ -114,7 +114,7 @@ def predict(self, data: pd.DataFrame) -> pd.DataFrame: Returns ------- - Original dataframe with "alarm" column added. + Original dataframe with "alarm" column and other relevant columns as available (e.g. "upperbound") added. """ super().predict(data) # Concat training and prediction data. Make index array for range param. 
@@ -129,7 +129,25 @@ def predict(self, data: pd.DataFrame) -> pd.DataFrame:
             np.where(full_data.provenance == "test")[0] + 1
         )
         surveillance_result = self._call_surveillance_algo(r_instance, detection_range)
-        return data.assign(alarm=self._extract_alarms(surveillance_result).astype(bool))
+        data = data.assign(
+            alarm=self._extract_slot(surveillance_result, "alarm").astype(bool)
+        )
+
+        # Let's check what other slots were returned
+        slot_keys = set()
+        if hasattr(surveillance_result, "slotnames"):
+            slot_keys = set(surveillance_result.slotnames())
+        elif hasattr(surveillance_result, "names"):
+            slot_keys = set(surveillance_result.names)
+
+        if "upperbound" in slot_keys:
+            data = data.assign(
+                upperbound=self._extract_slot(surveillance_result, "upperbound").astype(
+                    float
+                )
+            )
+
+        return data
 
     def _None_to_NULL(self, obj):  # NOQA
         return robjects.NULL if obj is None else obj
@@ -138,8 +156,8 @@ def _prepare_r_instance(self, data: pd.DataFrame):
         """Transform dataframe into R data structure on which the R algorithm can work."""
         raise NotImplementedError
 
-    def _extract_alarms(self, surveillance_result) -> np.ndarray:
-        """Extract the binary alarm array from the surveillance result R data structure."""
+    def _extract_slot(self, surveillance_result, slot_name) -> np.ndarray:
+        """Extract the array for the requested slot name from the surveillance result R data structure."""
         raise NotImplementedError
 
     def _call_surveillance_algo(self, sts, detection_range) -> pd.DataFrame:
@@ -173,8 +191,8 @@ def _prepare_r_instance(self, data: pd.DataFrame):
         )
         return sts
 
-    def _extract_alarms(self, surveillance_result):
-        return np.asarray(surveillance_result.slots["alarm"])
+    def _extract_slot(self, surveillance_result, slot_name):
+        return np.asarray(surveillance_result.slots[slot_name])
 
 
 class DisProgBasedAlgorithm(STSBasedAlgorithm):
@@ -184,7 +202,7 @@ def _prepare_r_instance(self, data: pd.DataFrame):
         sts = super()._prepare_r_instance(data)
         return surveillance.sts2disProg(sts)
 
-    def _extract_alarms(self, surveillance_result):
+    def _extract_slot(self, surveillance_result, slot_name):
         return np.asarray(
-            dict(zip(surveillance_result.names, list(surveillance_result)))["alarm"]
+            dict(zip(surveillance_result.names, list(surveillance_result)))[slot_name]
         )
diff --git a/epysurv/models/timeseries/_base.py b/epysurv/models/timeseries/_base.py
index 4ad552e..a06dcfb 100644
--- a/epysurv/models/timeseries/_base.py
+++ b/epysurv/models/timeseries/_base.py
@@ -9,6 +9,7 @@ def fit(self, data_generator):
 
     def predict(self, data_generator) -> pd.DataFrame:
         alarms = []
+        upperbounds = []
         times = []
         for x, _ in data_generator:
             # Fit on all data, except the last point, that is to be predicted.
@@ -19,8 +20,17 @@ def predict(self, data_generator) -> pd.DataFrame:
             # As only a single value should be returned, we can access this single item.
             [alarm] = prediction.alarm
             [time] = prediction.index
+
+            # Record "upperbound" as well when the prediction provides it
+            if hasattr(prediction, "upperbound"):
+                [upperbound] = prediction.upperbound
+                upperbounds.append(upperbound)
+
             alarms.append(alarm)
             times.append(time)
-        return pd.DataFrame(
-            {"alarm": alarms}, index=pd.DatetimeIndex(times, freq="infer")
-        )
+
+        frame_dict = {"alarm": alarms}
+        if len(upperbounds) > 0:
+            frame_dict["upperbound"] = upperbounds
+
+        return pd.DataFrame(frame_dict, index=pd.DatetimeIndex(times, freq="infer"))
diff --git a/tests/test_simulation_utils.py b/tests/test_simulation_utils.py
index 1295ea9..7de1897 100644
--- a/tests/test_simulation_utils.py
+++ b/tests/test_simulation_utils.py
@@ -1,7 +1,8 @@
 import pandas as pd
-from epysurv.simulation.utils import add_date_time_index_to_frame, r_list_to_frame
 from rpy2 import robjects
 
+from epysurv.simulation.utils import add_date_time_index_to_frame, r_list_to_frame
+
 
 def test_add_date_time_index_to_frame():
     df = add_date_time_index_to_frame(pd.DataFrame({"a": [1, 2, 3]}))
diff --git a/tests/test_timepoint_models.py b/tests/test_timepoint_models.py
index efc1064..16da9cc 100644
--- a/tests/test_timepoint_models.py
+++ b/tests/test_timepoint_models.py
@@ -19,7 +19,7 @@
     OutbreakP,
 )
 
-from .utils import load_predictions
+from .utils import drop_column_if_exists, load_predictions
 
 algos_to_test = [
     EarsC1,
@@ -43,7 +43,13 @@ def test_prediction(train_data, test_data, shared_datadir, Algo):
     model = Algo()
     model.fit(train_data)
     pred = model.predict(test_data)
+
     saved_predictions = load_predictions(shared_datadir / f"{Algo.__name__}_pred.csv")
+
+    # It does not make sense to check 'upperbound' for equality, so drop it if it exists
+    pred = drop_column_if_exists(pred, "upperbound")
+    saved_predictions = drop_column_if_exists(saved_predictions, "upperbound")
+
     assert_frame_equal(pred, saved_predictions)
 
 
@@ -62,6 +68,11 @@ def test_long_prediction(train_data, test_data, shared_datadir, Algo):
     model.fit(train_data)
     pred = model.predict(test_data)
     saved_predictions = load_predictions(shared_datadir / f"{Algo.__name__}_pred.csv")
+
+    # It does not make sense to check 'upperbound' for equality, so drop it if it exists
+    pred = drop_column_if_exists(pred, "upperbound")
+    saved_predictions = drop_column_if_exists(saved_predictions, "upperbound")
+
     assert_frame_equal(pred, saved_predictions)
 
 
@@ -86,7 +97,7 @@ def test_output_format(train_data, test_data):
     original_train_data = train_data.copy()
     original_test_data = test_data.copy()
     prediction = model.fit(train_data).predict(test_data)
-    assert set(test_data.columns) == (set(prediction.columns) - {"alarm"})
+    assert set(test_data.columns) == (set(prediction.columns) - {"alarm", "upperbound"})
 
 
 def test_validate_data_on_fit(train_data):
@@ -103,4 +114,9 @@ def test_prediction_witout_labels(train_data, test_data, shared_datadir, Algo):
     model.fit(train_data[["n_cases"]])
     pred = model.predict(test_data)
     saved_predictions = load_predictions(shared_datadir / f"{Algo.__name__}_pred.csv")
+
+    # It does not make sense to check 'upperbound' for equality, so drop it if it exists
+    pred = drop_column_if_exists(pred, "upperbound")
+    saved_predictions = drop_column_if_exists(saved_predictions, "upperbound")
+
     assert_frame_equal(pred, saved_predictions)
diff --git a/tests/test_timeseries_models.py b/tests/test_timeseries_models.py
index 644185c..85df265 100644
--- a/tests/test_timeseries_models.py
+++ b/tests/test_timeseries_models.py
@@ -1,7 +1,12 @@
 import numpy as np
 import pandas as pd
+import pytest
 
-from epysurv.models.timeseries import Farrington, GLRPoisson  # type: ignore
+from epysurv.models.timeseries import (  # type: ignore
+    Farrington,
+    FarringtonFlexible,
+    GLRPoisson,
+)
 
 from .utils import load_predictions
 
@@ -17,6 +22,36 @@ def test_farrington_timeseries_prediciton(tsc_generator, shared_datadir):
     pd.testing.assert_series_equal(pred.alarm, saved_predictions.alarm)
 
 
+def test_farrington_timeseries_prediction_columns(tsc_generator, shared_datadir):
+    model = Farrington()
+    model.fit(tsc_generator.train_gen)
+    pred = model.predict(tsc_generator.test_gen)
+
+    # check for the expected columns
+    pred_columns = list(pred.columns.values)
+
+    # 'alarm' is always present
+    assert "alarm" in pred_columns
+    # 'upperbound' should also be present, since this algorithm returns an upper bound
+    assert "upperbound" in pred_columns
+
+
+def test_farrington_flexible_timeseries_prediction_columns(
+    tsc_generator, shared_datadir
+):
+    model = FarringtonFlexible()
+    model.fit(tsc_generator.train_gen)
+    pred = model.predict(tsc_generator.test_gen)
+
+    # check for the expected columns
+    pred_columns = list(pred.columns.values)
+
+    # 'alarm' is always present
+    assert "alarm" in pred_columns
+    # 'upperbound' should also be present, since this algorithm returns an upper bound
+    assert "upperbound" in pred_columns
+
+
 def test_outbreak_case_subtraction():
     def test_gen():
         df = pd.DataFrame(
diff --git a/tests/utils.py b/tests/utils.py
index 89bebf2..3fbe396 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -15,3 +15,9 @@ def load_simulations(filepath):
     )
     freq = pd.infer_freq(simulations.index)
     return simulations.asfreq(freq)
+
+
+def drop_column_if_exists(df, column):
+    if column in df.columns:
+        df = df.drop(columns=column)
+    return df
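
With this patch applied, predictions carry the extra slot data whenever the underlying R surveillance result exposes it. A minimal usage sketch, not part of the patch: it assumes the usual epysurv input frame (an "n_cases" column on a weekly DatetimeIndex), uses EarsC1 purely as an example algorithm, and the sample counts are made up for illustration.

    import pandas as pd
    from epysurv.models.timepoint import EarsC1

    # Illustrative weekly case counts; any datetime-indexed frame with an
    # "n_cases" column works here.
    idx = pd.date_range("2019-01-07", periods=156, freq="W-MON")
    data = pd.DataFrame({"n_cases": 5}, index=idx)
    train, test = data.iloc[:104], data.iloc[104:]

    model = EarsC1()
    model.fit(train)
    pred = model.predict(test)

    # "alarm" is always returned; "upperbound" is added only when the
    # surveillance result exposes that slot.
    print(pred["alarm"].head())
    if "upperbound" in pred.columns:
        print(pred["upperbound"].head())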