From 2edde261886270b8ab6003b9b98f61de1a742ab8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Thu, 25 Jul 2024 12:30:08 -0600 Subject: [PATCH] ensure static features are constant (#391) --- mlforecast/__init__.py | 2 +- mlforecast/core.py | 25 +++++++-- nbs/core.ipynb | 53 ++++++++++++------- .../how-to-guides/transforming_exog.ipynb | 2 +- nbs/forecast.ipynb | 21 +++----- settings.ini | 6 +-- 6 files changed, 68 insertions(+), 41 deletions(-) diff --git a/mlforecast/__init__.py b/mlforecast/__init__.py index d9998b46..c2a37b61 100644 --- a/mlforecast/__init__.py +++ b/mlforecast/__init__.py @@ -1,3 +1,3 @@ -__version__ = "0.13.2" +__version__ = "0.13.3" __all__ = ['MLForecast'] from mlforecast.forecast import MLForecast diff --git a/mlforecast/core.py b/mlforecast/core.py index b509ddfe..22ad8ceb 100644 --- a/mlforecast/core.py +++ b/mlforecast/core.py @@ -293,8 +293,6 @@ def _fit( tfm.set_column_names(id_col, time_col, target_col) sorted_df = tfm.fit_transform(sorted_df) ga.data = sorted_df[target_col].to_numpy() - self.ga = ga - last_idxs_per_serie = self.ga.indptr[1:] - 1 to_drop = [id_col, time_col, target_col] if static_features is None: static_features = [c for c in df.columns if c not in [time_col, target_col]] @@ -302,10 +300,27 @@ def _fit( static_features = [id_col] + static_features else: # static_features defined and contain id_col to_drop = [time_col, target_col] + self.ga = ga + series_starts = ga.indptr[:-1] + series_ends = ga.indptr[1:] - 1 if self._sort_idxs is not None: - last_idxs_per_serie = self._sort_idxs[last_idxs_per_serie] - self.static_features_ = ufp.take_rows(df, last_idxs_per_serie)[static_features] - self.static_features_ = ufp.drop_index_if_pandas(self.static_features_) + series_starts = self._sort_idxs[series_starts] + series_ends = self._sort_idxs[series_ends] + statics_on_starts = ufp.drop_index_if_pandas( + ufp.take_rows(df, series_starts)[static_features] + ) + statics_on_ends = ufp.drop_index_if_pandas( + ufp.take_rows(df, series_ends)[static_features] + ) + for feat in static_features: + if (statics_on_starts[feat] != statics_on_ends[feat]).any(): + raise ValueError( + f"{feat} is declared as a static feature but its values change " + "over time. Please set the `static_features` argument to " + "indicate which features are static.\nIf all of your features " + "are dynamic please set `static_features=[]`." + ) + self.static_features_ = statics_on_ends self.features_order_ = [ c for c in df.columns if c not in to_drop ] + self.features diff --git a/nbs/core.ipynb b/nbs/core.ipynb index aae6c02f..a71288a9 100644 --- a/nbs/core.ipynb +++ b/nbs/core.ipynb @@ -776,8 +776,6 @@ " tfm.set_column_names(id_col, time_col, target_col)\n", " sorted_df = tfm.fit_transform(sorted_df)\n", " ga.data = sorted_df[target_col].to_numpy()\n", - " self.ga = ga\n", - " last_idxs_per_serie = self.ga.indptr[1:] - 1\n", " to_drop = [id_col, time_col, target_col]\n", " if static_features is None:\n", " static_features = [c for c in df.columns if c not in [time_col, target_col]]\n", @@ -785,10 +783,27 @@ " static_features = [id_col] + static_features\n", " else: # static_features defined and contain id_col\n", " to_drop = [time_col, target_col]\n", + " self.ga = ga\n", + " series_starts = ga.indptr[:-1]\n", + " series_ends = ga.indptr[1:] - 1\n", " if self._sort_idxs is not None:\n", - " last_idxs_per_serie = self._sort_idxs[last_idxs_per_serie]\n", - " self.static_features_ = ufp.take_rows(df, last_idxs_per_serie)[static_features]\n", - " self.static_features_ = ufp.drop_index_if_pandas(self.static_features_)\n", + " series_starts = self._sort_idxs[series_starts]\n", + " series_ends = self._sort_idxs[series_ends]\n", + " statics_on_starts = ufp.drop_index_if_pandas(\n", + " ufp.take_rows(df, series_starts)[static_features]\n", + " )\n", + " statics_on_ends = ufp.drop_index_if_pandas(\n", + " ufp.take_rows(df, series_ends)[static_features]\n", + " )\n", + " for feat in static_features:\n", + " if (statics_on_starts[feat] != statics_on_ends[feat]).any():\n", + " raise ValueError(\n", + " f'{feat} is declared as a static feature but its values change '\n", + " 'over time. Please set the `static_features` argument to '\n", + " 'indicate which features are static.\\nIf all of your features '\n", + " 'are dynamic please set `static_features=[]`.'\n", + " )\n", + " self.static_features_ = statics_on_ends\n", " self.features_order_ = [c for c in df.columns if c not in to_drop] + self.features\n", " return self\n", "\n", @@ -1647,7 +1662,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L466){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L481){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "## TimeSeries.fit_transform\n", "\n", @@ -1660,16 +1675,16 @@ "> max_horizon:Optional[int]=None,\n", "> return_X_y:bool=False, as_numpy:bool=False)\n", "\n", - "Add the features to `data` and save the required information for the predictions step.\n", + "*Add the features to `data` and save the required information for the predictions step.\n", "\n", "If not all features are static, specify which ones are in `static_features`.\n", "If you don't want to drop rows with null values after the transformations set `dropna=False`\n", - "If `keep_last_n` is not None then that number of observations is kept across all series for updates." + "If `keep_last_n` is not None then that number of observations is kept across all series for updates.*" ], "text/plain": [ "---\n", "\n", - "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L466){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L481){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "## TimeSeries.fit_transform\n", "\n", @@ -1682,11 +1697,11 @@ "> max_horizon:Optional[int]=None,\n", "> return_X_y:bool=False, as_numpy:bool=False)\n", "\n", - "Add the features to `data` and save the required information for the predictions step.\n", + "*Add the features to `data` and save the required information for the predictions step.\n", "\n", "If not all features are static, specify which ones are in `static_features`.\n", "If you don't want to drop rows with null values after the transformations set `dropna=False`\n", - "If `keep_last_n` is not None then that number of observations is kept across all series for updates." + "If `keep_last_n` is not None then that number of observations is kept across all series for updates.*" ] }, "execution_count": null, @@ -1917,7 +1932,9 @@ "ts = TimeSeries(**flow_config)\n", "df = ts.fit_transform(series, id_col='unique_id', time_col='ds', target_col='y')\n", "non_std_series = series.reset_index().rename(columns={'unique_id': 'some_id', 'ds': 'timestamp', 'y': 'value'})\n", - "non_std_res = ts.fit_transform(non_std_series, id_col='some_id', time_col='timestamp', target_col='value')\n", + "non_std_res = ts.fit_transform(\n", + " non_std_series, id_col='some_id', time_col='timestamp', target_col='value', static_features=[]\n", + ")\n", "non_std_res = non_std_res.reset_index(drop=True)\n", "pd.testing.assert_frame_equal(\n", " df.reset_index(),\n", @@ -1960,7 +1977,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L711){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L726){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "## TimeSeries.predict\n", "\n", @@ -1975,7 +1992,7 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L711){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L726){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "## TimeSeries.predict\n", "\n", @@ -2114,7 +2131,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L817){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L831){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "## TimeSeries.update\n", "\n", @@ -2122,12 +2139,12 @@ "> (df:Union[pandas.core.frame.DataFrame,polars.dataframe\n", "> .frame.DataFrame])\n", "\n", - "Update the values of the stored series." + "*Update the values of the stored series.*" ], "text/plain": [ "---\n", "\n", - "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L817){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L831){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "## TimeSeries.update\n", "\n", @@ -2135,7 +2152,7 @@ "> (df:Union[pandas.core.frame.DataFrame,polars.dataframe\n", "> .frame.DataFrame])\n", "\n", - "Update the values of the stored series." + "*Update the values of the stored series.*" ] }, "execution_count": null, diff --git a/nbs/docs/how-to-guides/transforming_exog.ipynb b/nbs/docs/how-to-guides/transforming_exog.ipynb index 492c3340..df51588f 100644 --- a/nbs/docs/how-to-guides/transforming_exog.ipynb +++ b/nbs/docs/how-to-guides/transforming_exog.ipynb @@ -605,7 +605,7 @@ " lags=[1],\n", " date_features=['dayofweek'],\n", ")\n", - "fcst.preprocess(series_with_prices, dropna=True).head()" + "fcst.preprocess(series_with_prices, static_features=[], dropna=True).head()" ] }, { diff --git a/nbs/forecast.ipynb b/nbs/forecast.ipynb index 714c1c1b..68a2619f 100644 --- a/nbs/forecast.ipynb +++ b/nbs/forecast.ipynb @@ -4525,21 +4525,12 @@ "execution_count": null, "id": "4e9188fd-8264-41d1-a4d7-89fa51d915d4", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_79672/2131933040.py:4: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", - " non_std_series['ds'] = non_std_series.groupby('unique_id').cumcount()\n" - ] - } - ], + "outputs": [], "source": [ "#| hide\n", "series = generate_daily_series(100, equal_ends=True, n_static_features=2, static_as_categorical=False)\n", "non_std_series = series.copy()\n", - "non_std_series['ds'] = non_std_series.groupby('unique_id').cumcount()\n", + "non_std_series['ds'] = non_std_series.groupby('unique_id', observed=True).cumcount()\n", "non_std_series = non_std_series.rename(columns={'unique_id': 'some_id', 'ds': 'time', 'y': 'value'})\n", "models = [\n", " lgb.LGBMRegressor(n_jobs=1, random_state=0, verbosity=-1),\n", @@ -4705,7 +4696,9 @@ " fitted_pl = fcst_pl.forecast_fitted_values()\n", " preds_pl = fcst_pl.predict(X_df=prices_pl, **predict_kwargs)\n", " preds_pl_subset = fcst_pl.predict(X_df=prices_pl, ids=fcst_pl.ts.uids[[0, 6]], **predict_kwargs)\n", - " cv_pl = fcst_pl.cross_validation(series_pl, n_windows=2, h=horizon, fitted=True, as_numpy=as_numpy)\n", + " cv_pl = fcst_pl.cross_validation(\n", + " series_pl, n_windows=2, h=horizon, fitted=True, static_features=['product_id', 'static_1'], as_numpy=as_numpy\n", + " )\n", " cv_fitted_pl = fcst_pl.cross_validation_fitted_values()\n", " \n", " fcst_pd = MLForecast(**cfg)\n", @@ -4714,7 +4707,9 @@ " preds_pd = fcst_pd.predict(X_df=prices_pd, **predict_kwargs)\n", " preds_pd_subset = fcst_pd.predict(X_df=prices_pd, ids=fcst_pd.ts.uids[[0, 6]], **predict_kwargs)\n", " assert preds_pd_subset['unique_id'].unique().tolist() == ['id_0', 'id_6']\n", - " cv_pd = fcst_pd.cross_validation(series_pd, n_windows=2, h=horizon, fitted=True, as_numpy=as_numpy)\n", + " cv_pd = fcst_pd.cross_validation(\n", + " series_pd, n_windows=2, h=horizon, fitted=True, static_features=['product_id', 'static_1'], as_numpy=as_numpy\n", + " )\n", " cv_fitted_pd = fcst_pd.cross_validation_fitted_values()\n", "\n", " if max_horizon is not None:\n", diff --git a/settings.ini b/settings.ini index c55e79bf..c35685cf 100644 --- a/settings.ini +++ b/settings.ini @@ -8,7 +8,7 @@ author = José Morales author_email = jmoralz92@gmail.com copyright = Nixtla branch = main -version = 0.13.2 +version = 0.13.3 min_python = 3.8 audience = Developers language = English @@ -35,8 +35,8 @@ title = mlforecast tst_flags = polars black_formatting = True readme_nb = index.ipynb -allowed_metadata_keys = -allowed_cell_metadata_keys = +allowed_metadata_keys = +allowed_cell_metadata_keys = jupyter_hooks = True clean_ids = True clear_all = False