Skip to content

Commit

Permalink
Augmenting time series which have additional useful columns besides '… (
Browse files Browse the repository at this point in the history
#35)

* Augmenting time series which have additional useful columns besides 'alarm' such as 'upperbound' so that we can access these.  Also added a few tests.  All tests still pass.

* Removing parameter for passing back more columns besides 'alarm' which are available and useful (e.g. 'upperbound').  Not yet complete since some tests are not passing.

* Fixing some testing behavior so that when present, "upperbound" will not cause failures with test expectations.

* Fixing documentation and typos in slot value extraction

Fixing documentation and typos in slot value extraction

Co-authored-by: Rüdiger Busche <[email protected]>

* Handling a merge conflict so that the current master change will still return np.ndarray even though the signature and behavior have changed to accommodate any slot name and not simply 'alarm'.

* Fixed formatting via black.

* Fixed import order via isort.

Co-authored-by: Rüdiger Busche <[email protected]>
  • Loading branch information
burgersmoke and JarnoRFB authored Jul 8, 2021
1 parent 772cb0b commit ed297b4
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 15 deletions.
34 changes: 26 additions & 8 deletions epysurv/models/timepoint/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def predict(self, data: pd.DataFrame) -> pd.DataFrame:
Returns
-------
Original dataframe with "alarm" column added.
Original dataframe with "alarm" column and other relevant columns as available (e.g. "upperbound") added.
"""
super().predict(data)
# Concat training and prediction data. Make index array for range param.
Expand All @@ -129,7 +129,25 @@ def predict(self, data: pd.DataFrame) -> pd.DataFrame:
np.where(full_data.provenance == "test")[0] + 1
)
surveillance_result = self._call_surveillance_algo(r_instance, detection_range)
return data.assign(alarm=self._extract_alarms(surveillance_result).astype(bool))
data = data.assign(
alarm=self._extract_slot(surveillance_result, "alarm").astype(bool)
)

# Let's check what other slots were returned
slot_keys = set()
if hasattr(surveillance_result, "slotnames"):
slot_keys = set(surveillance_result.slotnames())
elif hasattr(surveillance_result, "names"):
slot_keys = set(surveillance_result.names)

if "upperbound" in slot_keys:
data = data.assign(
upperbound=self._extract_slot(surveillance_result, "upperbound").astype(
float
)
)

return data

def _None_to_NULL(self, obj): # NOQA
return robjects.NULL if obj is None else obj
Expand All @@ -138,8 +156,8 @@ def _prepare_r_instance(self, data: pd.DataFrame):
"""Transform dataframe into R data structure on which the R algorithm can work."""
raise NotImplementedError

def _extract_alarms(self, surveillance_result) -> np.ndarray:
"""Extract the binary alarm array from the surveillance result R data structure."""
def _extract_slot(self, surveillance_result, slot_name) -> np.ndarray:
"""Extract the array for the requested slot name from the surveillance result R data structure."""
raise NotImplementedError

def _call_surveillance_algo(self, sts, detection_range) -> pd.DataFrame:
Expand Down Expand Up @@ -173,8 +191,8 @@ def _prepare_r_instance(self, data: pd.DataFrame):
)
return sts

def _extract_alarms(self, surveillance_result):
return np.asarray(surveillance_result.slots["alarm"])
def _extract_slot(self, surveillance_result, slot_name):
return np.asarray(surveillance_result.slots[slot_name])


class DisProgBasedAlgorithm(STSBasedAlgorithm):
Expand All @@ -184,7 +202,7 @@ def _prepare_r_instance(self, data: pd.DataFrame):
sts = super()._prepare_r_instance(data)
return surveillance.sts2disProg(sts)

def _extract_alarms(self, surveillance_result):
def _extract_slot(self, surveillance_result, slot_name):
return np.asarray(
dict(zip(surveillance_result.names, list(surveillance_result)))["alarm"]
dict(zip(surveillance_result.names, list(surveillance_result)))[slot_name]
)
16 changes: 13 additions & 3 deletions epysurv/models/timeseries/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ def fit(self, data_generator):

def predict(self, data_generator) -> pd.DataFrame:
alarms = []
upperbounds = []
times = []
for x, _ in data_generator:
# Fit on all data, except the last point, that is to be predicted.
Expand All @@ -19,8 +20,17 @@ def predict(self, data_generator) -> pd.DataFrame:
# As only a single value should be returned, we can access this single item.
[alarm] = prediction.alarm
[time] = prediction.index

# Check if "upperbound" is available and add if available
if hasattr(prediction, "upperbound"):
[upperbound] = prediction.upperbound
upperbounds.append(upperbound)

alarms.append(alarm)
times.append(time)
return pd.DataFrame(
{"alarm": alarms}, index=pd.DatetimeIndex(times, freq="infer")
)

frame_dict = {"alarm": alarms}
if len(upperbounds) > 0:
frame_dict["upperbound"] = upperbounds

return pd.DataFrame(frame_dict, index=pd.DatetimeIndex(times, freq="infer"))
3 changes: 2 additions & 1 deletion tests/test_simulation_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import pandas as pd
from epysurv.simulation.utils import add_date_time_index_to_frame, r_list_to_frame
from rpy2 import robjects

from epysurv.simulation.utils import add_date_time_index_to_frame, r_list_to_frame


def test_add_date_time_index_to_frame():
df = add_date_time_index_to_frame(pd.DataFrame({"a": [1, 2, 3]}))
Expand Down
20 changes: 18 additions & 2 deletions tests/test_timepoint_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
OutbreakP,
)

from .utils import load_predictions
from .utils import drop_column_if_exists, load_predictions

algos_to_test = [
EarsC1,
Expand All @@ -43,7 +43,13 @@ def test_prediction(train_data, test_data, shared_datadir, Algo):
model = Algo()
model.fit(train_data)
pred = model.predict(test_data)

saved_predictions = load_predictions(shared_datadir / f"{Algo.__name__}_pred.csv")

# 'upperbound' does not make sense to check for equality, so let's remove it if it exists
pred = drop_column_if_exists(pred, "upperbound")
saved_predictions = drop_column_if_exists(saved_predictions, "upperbound")

assert_frame_equal(pred, saved_predictions)


Expand All @@ -62,6 +68,11 @@ def test_long_prediction(train_data, test_data, shared_datadir, Algo):
model.fit(train_data)
pred = model.predict(test_data)
saved_predictions = load_predictions(shared_datadir / f"{Algo.__name__}_pred.csv")

# 'upperbound' does not make sense to check for equality, so let's remove it if it exists
pred = drop_column_if_exists(pred, "upperbound")
saved_predictions = drop_column_if_exists(saved_predictions, "upperbound")

assert_frame_equal(pred, saved_predictions)


Expand All @@ -86,7 +97,7 @@ def test_output_format(train_data, test_data):
original_train_data = train_data.copy()
original_test_data = test_data.copy()
prediction = model.fit(train_data).predict(test_data)
assert set(test_data.columns) == (set(prediction.columns) - {"alarm"})
assert set(test_data.columns) == (set(prediction.columns) - {"alarm", "upperbound"})


def test_validate_data_on_fit(train_data):
Expand All @@ -103,4 +114,9 @@ def test_prediction_witout_labels(train_data, test_data, shared_datadir, Algo):
model.fit(train_data[["n_cases"]])
pred = model.predict(test_data)
saved_predictions = load_predictions(shared_datadir / f"{Algo.__name__}_pred.csv")

# 'upperbound' does not make sense to check for equality, so let's remove it if it exists
pred = drop_column_if_exists(pred, "upperbound")
saved_predictions = drop_column_if_exists(saved_predictions, "upperbound")

assert_frame_equal(pred, saved_predictions)
37 changes: 36 additions & 1 deletion tests/test_timeseries_models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import numpy as np
import pandas as pd
import pytest

from epysurv.models.timeseries import Farrington, GLRPoisson # type: ignore
from epysurv.models.timeseries import ( # type: ignore
Farrington,
FarringtonFlexible,
GLRPoisson,
)

from .utils import load_predictions

Expand All @@ -17,6 +22,36 @@ def test_farrington_timeseries_prediciton(tsc_generator, shared_datadir):
pd.testing.assert_series_equal(pred.alarm, saved_predictions.alarm)


def test_farrington_timeseries_prediction_columns(tsc_generator, shared_datadir):
    """Check that a Farrington time series prediction has "alarm" and "upperbound" columns."""
    model = Farrington()
    model.fit(tsc_generator.train_gen)
    pred = model.predict(tsc_generator.test_gen)

    predicted_columns = pred.columns.tolist()

    # Every prediction frame carries the alarm flag.
    assert "alarm" in predicted_columns
    # This test additionally expects the upper bound to be part of the output.
    assert "upperbound" in predicted_columns


def test_farrington_flexible_timeseries_prediction_columns(
    tsc_generator, shared_datadir
):
    """Check that a FarringtonFlexible prediction has "alarm" and "upperbound" columns."""
    model = FarringtonFlexible()
    model.fit(tsc_generator.train_gen)
    pred = model.predict(tsc_generator.test_gen)

    predicted_columns = pred.columns.tolist()

    # Every prediction frame carries the alarm flag.
    assert "alarm" in predicted_columns
    # This test additionally expects the upper bound to be part of the output.
    assert "upperbound" in predicted_columns


def test_outbreak_case_subtraction():
def test_gen():
df = pd.DataFrame(
Expand Down
6 changes: 6 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,9 @@ def load_simulations(filepath):
)
freq = pd.infer_freq(simulations.index)
return simulations.asfreq(freq)


def drop_column_if_exists(df, column):
    """Return ``df`` without ``column``.

    When ``column`` is not among ``df.columns`` the input frame itself is
    returned; otherwise the result of ``df.drop(columns=column)`` is returned.
    """
    if column not in df.columns:
        return df
    return df.drop(columns=column)

0 comments on commit ed297b4

Please sign in to comment.