From dd47e85a89a92c3f5ccf4c48f9a70c472a6c9fb1 Mon Sep 17 00:00:00 2001 From: Aadyot Bhatnagar Date: Mon, 18 Apr 2022 14:28:41 -0700 Subject: [PATCH] Allow TimeSeries to include NaN values. (#85) --- merlion/utils/time_series.py | 44 ++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/merlion/utils/time_series.py b/merlion/utils/time_series.py index dadb1fd48..ebe83e957 100644 --- a/merlion/utils/time_series.py +++ b/merlion/utils/time_series.py @@ -726,24 +726,24 @@ def to_csv(self, file_name): self.to_pd().to_csv(file_name) @classmethod - def from_pd(cls, df: Union[pd.Series, pd.DataFrame, np.ndarray], check_times=True, freq="1h"): + def from_pd(cls, df: Union[pd.Series, pd.DataFrame, np.ndarray], check_times=True, drop_nan=True, freq="1h"): """ - :param df: A pandas DataFrame with a DatetimeIndex. Each column - corresponds to a different variable of the time series, and the - key of column (in sorted order) give the relative order of those - variables (in the list self.univariates). Missing values should be - represented with ``NaN``. May also be a pandas Series for univariate - time series. - :param check_times: whether to check that all times in the index are - unique (up to the millisecond) and sorted. - :param freq: if ``df`` is not indexed by time, this is the frequency - at which we will assume it is sampled. + :param df: A ``pandas.DataFrame`` with a ``DatetimeIndex``. Each column corresponds to a different variable of + the time series, and the key of column (in sorted order) give the relative order of those variables in + ``self.univariates``. Missing values should be represented with ``NaN``. May also be a ``pandas.Series`` + for single-variable time series. + :param check_times: whether to check that all times in the index are unique (up to the millisecond) and sorted. + :param drop_nan: whether to drop all ``NaN`` entries before creating the time series. Specifying ``False`` is + useful if you wish to impute the values on your own. + :param freq: if ``df`` is not indexed by time, this is the frequency at which we will assume it is sampled. :rtype: TimeSeries :return: the `TimeSeries` object corresponding to ``df``. """ if isinstance(df, pd.Series): - return cls({df.name: UnivariateTimeSeries.from_pd(df[~df.isna()])}) + if drop_nan: + df = df[~df.isna()] + return cls({df.name: UnivariateTimeSeries.from_pd(df)}) elif isinstance(df, np.ndarray): arr = df.reshape(len(df), -1).T ret = cls([UnivariateTimeSeries(time_stamps=None, values=v, freq=freq) for v in arr], check_aligned=False) @@ -771,12 +771,18 @@ def from_pd(cls, df: Union[pd.Series, pd.DataFrame, np.ndarray], check_times=Tru f"type {type(df.index).__name__}" ) - ret = cls( - ValIterOrderedDict( - [(k, UnivariateTimeSeries.from_pd(ser[~ser.isna()], freq=freq)) for k, ser in df.items()] - ), - check_aligned=False, - ) + if drop_nan: + ret = cls( + ValIterOrderedDict( + [(k, UnivariateTimeSeries.from_pd(ser[~ser.isna()], freq=freq)) for k, ser in df.items()] + ), + check_aligned=False, + ) + else: + ret = cls( + ValIterOrderedDict([(k, UnivariateTimeSeries.from_pd(ser, freq=freq)) for k, ser in df.items()]), + check_aligned=False, + ) ret._is_aligned = aligned return ret @@ -861,7 +867,7 @@ def align( "Attempting to align an empty time series to a set of reference time stamps or a " "fixed granularity. Doing nothing." ) - return self.__class__.from_pd(self.to_pd()) + return TimeSeries.from_pd(self.to_pd()) if reference is not None or alignment_policy is AlignPolicy.FixedReference: if reference is None: