Augmenting time series which have additional useful columns besides '… (

#35) * Augmenting time series which have additional useful columns besides 'alarm', such as 'upperbound', so that we can access these. Also added a few tests. All tests still pass. * Removing the parameter for passing back more columns besides 'alarm' which are available and useful (e.g. 'upperbound'). Not yet complete since some tests are not passing. * Fixing some testing behavior so that, when present, "upperbound" will not cause failures with test expectations. * Fixing documentation and typos in slot value extraction Co-authored-by: Rüdiger Busche <[email protected]> * Handling a merge conflict so that the current master change will still return np.ndarray even though the signature and behavior have changed to accommodate any slot name and not simply 'alarm'. * Fixed formatting via black. * Fixed import order via isort. Co-authored-by: Rüdiger Busche <[email protected]>
JarnoRFB · Jul 8, 2021 · ed297b4 · ed297b4
1 parent 772cb0b
commit ed297b4
Show file tree

Hide file tree

Showing 6 changed files with 101 additions and 15 deletions.
diff --git a/epysurv/models/timepoint/_base.py b/epysurv/models/timepoint/_base.py
@@ -114,7 +114,7 @@ def predict(self, data: pd.DataFrame) -> pd.DataFrame:
 
         Returns
         -------
-            Original dataframe with "alarm" column added.
+            Original dataframe with "alarm" column and other relevant columns as available (e.g. "upperbound") added.
         """
         super().predict(data)
         # Concat training and prediction data. Make index array for range param.
@@ -129,7 +129,25 @@ def predict(self, data: pd.DataFrame) -> pd.DataFrame:
             np.where(full_data.provenance == "test")[0] + 1
         )
         surveillance_result = self._call_surveillance_algo(r_instance, detection_range)
-        return data.assign(alarm=self._extract_alarms(surveillance_result).astype(bool))
+        data = data.assign(
+            alarm=self._extract_slot(surveillance_result, "alarm").astype(bool)
+        )
+
+        # Let's check what other slots were returned
+        slot_keys = set()
+        if hasattr(surveillance_result, "slotnames"):
+            slot_keys = set(surveillance_result.slotnames())
+        elif hasattr(surveillance_result, "names"):
+            slot_keys = set(surveillance_result.names)
+
+        if "upperbound" in slot_keys:
+            data = data.assign(
+                upperbound=self._extract_slot(surveillance_result, "upperbound").astype(
+                    float
+                )
+            )
+
+        return data
 
     def _None_to_NULL(self, obj):  # NOQA
         return robjects.NULL if obj is None else obj
@@ -138,8 +156,8 @@ def _prepare_r_instance(self, data: pd.DataFrame):
         """Transform dataframe into R data structure on which the R algorithm can work."""
         raise NotImplementedError
 
-    def _extract_alarms(self, surveillance_result) -> np.ndarray:
-        """Extract the binary alarm array from the surveillance result R data structure."""
+    def _extract_slot(self, surveillance_result, slot_name) -> np.ndarray:
+        """Extract the array for the requested slot name from the surveillance result R data structure."""
         raise NotImplementedError
 
     def _call_surveillance_algo(self, sts, detection_range) -> pd.DataFrame:
@@ -173,8 +191,8 @@ def _prepare_r_instance(self, data: pd.DataFrame):
         )
         return sts
 
-    def _extract_alarms(self, surveillance_result):
-        return np.asarray(surveillance_result.slots["alarm"])
+    def _extract_slot(self, surveillance_result, slot_name):
+        return np.asarray(surveillance_result.slots[slot_name])
 
 
 class DisProgBasedAlgorithm(STSBasedAlgorithm):
@@ -184,7 +202,7 @@ def _prepare_r_instance(self, data: pd.DataFrame):
         sts = super()._prepare_r_instance(data)
         return surveillance.sts2disProg(sts)
 
-    def _extract_alarms(self, surveillance_result):
+    def _extract_slot(self, surveillance_result, slot_name):
         return np.asarray(
-            dict(zip(surveillance_result.names, list(surveillance_result)))["alarm"]
+            dict(zip(surveillance_result.names, list(surveillance_result)))[slot_name]
         )
diff --git a/epysurv/models/timeseries/_base.py b/epysurv/models/timeseries/_base.py
@@ -9,6 +9,7 @@ def fit(self, data_generator):
 
     def predict(self, data_generator) -> pd.DataFrame:
         alarms = []
+        upperbounds = []
         times = []
         for x, _ in data_generator:
             # Fit on all data, except the last point, that is to be predicted.
@@ -19,8 +20,17 @@ def predict(self, data_generator) -> pd.DataFrame:
             # As only a single value should be returned, we can access this single item.
             [alarm] = prediction.alarm
             [time] = prediction.index
+
+            # Collect "upperbound" as well when the prediction provides it
+            if hasattr(prediction, "upperbound"):
+                [upperbound] = prediction.upperbound
+                upperbounds.append(upperbound)
+
             alarms.append(alarm)
             times.append(time)
-        return pd.DataFrame(
-            {"alarm": alarms}, index=pd.DatetimeIndex(times, freq="infer")
-        )
+
+        frame_dict = {"alarm": alarms}
+        if len(upperbounds) > 0:
+            frame_dict["upperbound"] = upperbounds
+
+        return pd.DataFrame(frame_dict, index=pd.DatetimeIndex(times, freq="infer"))
diff --git a/tests/test_simulation_utils.py b/tests/test_simulation_utils.py
@@ -1,7 +1,8 @@
 import pandas as pd
-from epysurv.simulation.utils import add_date_time_index_to_frame, r_list_to_frame
 from rpy2 import robjects
 
+from epysurv.simulation.utils import add_date_time_index_to_frame, r_list_to_frame
+
 
 def test_add_date_time_index_to_frame():
     df = add_date_time_index_to_frame(pd.DataFrame({"a": [1, 2, 3]}))

diff --git a/tests/test_timepoint_models.py b/tests/test_timepoint_models.py
@@ -19,7 +19,7 @@
     OutbreakP,
 )
 
-from .utils import load_predictions
+from .utils import drop_column_if_exists, load_predictions
 
 algos_to_test = [
     EarsC1,
@@ -43,7 +43,13 @@ def test_prediction(train_data, test_data, shared_datadir, Algo):
     model = Algo()
     model.fit(train_data)
     pred = model.predict(test_data)
+
     saved_predictions = load_predictions(shared_datadir / f"{Algo.__name__}_pred.csv")
+
+    # 'upperbound' does not make sense to check for equality, so let's remove it if it exists
+    pred = drop_column_if_exists(pred, "upperbound")
+    saved_predictions = drop_column_if_exists(saved_predictions, "upperbound")
+
     assert_frame_equal(pred, saved_predictions)
 
 
@@ -62,6 +68,11 @@ def test_long_prediction(train_data, test_data, shared_datadir, Algo):
     model.fit(train_data)
     pred = model.predict(test_data)
     saved_predictions = load_predictions(shared_datadir / f"{Algo.__name__}_pred.csv")
+
+    # 'upperbound' does not make sense to check for equality, so let's remove it if it exists
+    pred = drop_column_if_exists(pred, "upperbound")
+    saved_predictions = drop_column_if_exists(saved_predictions, "upperbound")
+
     assert_frame_equal(pred, saved_predictions)
 
 
@@ -86,7 +97,7 @@ def test_output_format(train_data, test_data):
     original_train_data = train_data.copy()
     original_test_data = test_data.copy()
     prediction = model.fit(train_data).predict(test_data)
-    assert set(test_data.columns) == (set(prediction.columns) - {"alarm"})
+    assert set(test_data.columns) == (set(prediction.columns) - {"alarm", "upperbound"})
 
 
 def test_validate_data_on_fit(train_data):
@@ -103,4 +114,9 @@ def test_prediction_witout_labels(train_data, test_data, shared_datadir, Algo):
         model.fit(train_data[["n_cases"]])
     pred = model.predict(test_data)
     saved_predictions = load_predictions(shared_datadir / f"{Algo.__name__}_pred.csv")
+
+    # 'upperbound' does not make sense to check for equality, so let's remove it if it exists
+    pred = drop_column_if_exists(pred, "upperbound")
+    saved_predictions = drop_column_if_exists(saved_predictions, "upperbound")
+
     assert_frame_equal(pred, saved_predictions)
diff --git a/tests/test_timeseries_models.py b/tests/test_timeseries_models.py
@@ -1,7 +1,12 @@
 import numpy as np
 import pandas as pd
+import pytest
 
-from epysurv.models.timeseries import Farrington, GLRPoisson  # type: ignore
+from epysurv.models.timeseries import (  # type: ignore
+    Farrington,
+    FarringtonFlexible,
+    GLRPoisson,
+)
 
 from .utils import load_predictions
 
@@ -17,6 +22,36 @@ def test_farrington_timeseries_prediciton(tsc_generator, shared_datadir):
     pd.testing.assert_series_equal(pred.alarm, saved_predictions.alarm)
 
 
+def test_farrington_timeseries_prediction_columns(tsc_generator, shared_datadir):
+    model = Farrington()
+    model.fit(tsc_generator.train_gen)
+    pred = model.predict(tsc_generator.test_gen)
+
+    # check for columns
+    pred_columns = list(pred.columns.values)
+
+    # this one is always here
+    assert "alarm" in pred_columns
+    # and this one should be present too, since predict() adds it whenever the algorithm returns it
+    assert "upperbound" in pred_columns
+
+
+def test_farrington_flexible_timeseries_prediction_columns(
+    tsc_generator, shared_datadir
+):
+    model = FarringtonFlexible()
+    model.fit(tsc_generator.train_gen)
+    pred = model.predict(tsc_generator.test_gen)
+
+    # check for columns
+    pred_columns = list(pred.columns.values)
+
+    # this one is always here
+    assert "alarm" in pred_columns
+    # and this one should be present too, since predict() adds it whenever the algorithm returns it
+    assert "upperbound" in pred_columns
+
+
 def test_outbreak_case_subtraction():
     def test_gen():
         df = pd.DataFrame(

diff --git a/tests/utils.py b/tests/utils.py
@@ -15,3 +15,9 @@ def load_simulations(filepath):
     )
     freq = pd.infer_freq(simulations.index)
     return simulations.asfreq(freq)
+
+
+def drop_column_if_exists(df, column):
+    if column in df.columns:
+        df = df.drop(columns=column)
+    return df