Merge pull request #256 from winedarksea/dev
0.6.21
winedarksea authored Mar 5, 2025
2 parents 2735174 + 2286bf9 commit 7258a7a
Showing 33 changed files with 174 additions and 111 deletions.
6 changes: 2 additions & 4 deletions TODO.md
@@ -13,10 +13,8 @@
 * Forecasts are desired for the future immediately following the most recent data.
 * trimmed_mean to AverageValueNaive
 
-# 0.6.20 🇺🇦 🇺🇦 🇺🇦
-* transformer bug fixes
-* Prophet package adjustments
-* linear model singular matrix handling
+# 0.6.21 🇺🇦 🇺🇦 🇺🇦
+* Prophet and Cassandra bug fixes
 
 ### Unstable Upstream Packages (those that are frequently broken by maintainers)
 * Pytorch-Forecasting
2 changes: 1 addition & 1 deletion autots/__init__.py
@@ -27,7 +27,7 @@
 from autots.models.cassandra import Cassandra
 
 
-__version__ = '0.6.20'
+__version__ = '0.6.21'
 
 TransformTS = GeneralTransformer
 
27 changes: 14 additions & 13 deletions autots/evaluator/auto_ts.py
@@ -156,6 +156,7 @@ class AutoTS(object):
         best_model_params (dict): model params
         best_model_transformation_params (dict): transformation parameters
         best_model_ensemble (int): Ensemble type int id
+        used_frequency (str): datetime frequency offset string
         regression_check (bool): If True, the best_model uses an input 'User' future_regressor
         df_wide_numeric (pd.DataFrame): dataframe containing shaped final data, will include preclean
         initial_results.model_results (object): contains a collection of result metrics
@@ -439,9 +440,9 @@ def __init__(
 
                 full_params['transformations'] = transformations
                 full_params['transformation_params'] = transformation_params
-                self.initial_template.loc[index, 'TransformationParameters'] = (
-                    json.dumps(full_params)
-                )
+                self.initial_template.loc[
+                    index, 'TransformationParameters'
+                ] = json.dumps(full_params)
 
         self.regressor_used = False
         self.subset_flag = False
@@ -2022,10 +2023,10 @@ def _run_template(
         self.model_count = template_result.model_count
         # capture results from lower-level template run
         if "TotalRuntime" in template_result.model_results.columns:
-            template_result.model_results['TotalRuntime'] = (
-                template_result.model_results['TotalRuntime'].fillna(
-                    pd.Timedelta(seconds=60)
-                )
+            template_result.model_results[
+                'TotalRuntime'
+            ] = template_result.model_results['TotalRuntime'].fillna(
+                pd.Timedelta(seconds=60)
             )
         else:
             # trying to catch a rare and sneaky bug (perhaps some variety of beetle?)
@@ -2162,9 +2163,9 @@ def _run_validations(
                     frac=0.8, random_state=self.random_seed
                 ).reindex(idx)
                 nan_frac = val_df_train.shape[1] / num_validations
-                val_df_train.iloc[-2:, int(nan_frac * y) : int(nan_frac * (y + 1))] = (
-                    np.nan
-                )
+                val_df_train.iloc[
+                    -2:, int(nan_frac * y) : int(nan_frac * (y + 1))
+                ] = np.nan
 
             # run validation template on current slice
             result = self._run_template(
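
For readers following the hunk above: it deliberately blanks out the last two rows of a moving slice of columns in each validation slice, so the template search also scores how models cope with missing recent data. A minimal standalone sketch of that pattern (shapes and names here are illustrative, not from the AutoTS API):

    import numpy as np
    import pandas as pd

    val_df_train = pd.DataFrame(np.random.rand(20, 10))  # hypothetical wide data
    num_validations, y = 5, 1
    nan_frac = val_df_train.shape[1] / num_validations  # slice width per validation
    # blank the last 2 rows of the y-th column slice
    val_df_train.iloc[-2:, int(nan_frac * y) : int(nan_frac * (y + 1))] = np.nan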
@@ -4845,9 +4846,9 @@ def diagnose_params(self, target='runtime', waterfall_plots=True):
             )
             y = pd.json_normalize(json.loads(row["ModelParameters"]))
             y.index = [row['ID']]
-            y['Model'] = (
-                x  # might need to remove this and do analysis independently for each
-            )
+            y[
+                'Model'
+            ] = x  # might need to remove this and do analysis independently for each
             res.append(
                 pd.DataFrame(
                     {
12 changes: 6 additions & 6 deletions autots/models/base.py
@@ -490,18 +490,18 @@ def long_form_results(
             value_name=value_name,
             id_vars="datetime",
         ).set_index("datetime")
-        upload_upper[interval_name] = (
-            f"{round(100 - ((1- self.prediction_interval)/2) * 100, 0)}%"
-        )
+        upload_upper[
+            interval_name
+        ] = f"{round(100 - ((1- self.prediction_interval)/2) * 100, 0)}%"
         upload_lower = pd.melt(
             self.lower_forecast.rename_axis(index='datetime').reset_index(),
             var_name=id_name,
             value_name=value_name,
             id_vars="datetime",
         ).set_index("datetime")
-        upload_lower[interval_name] = (
-            f"{round(((1- self.prediction_interval)/2) * 100, 0)}%"
-        )
+        upload_lower[
+            interval_name
+        ] = f"{round(((1- self.prediction_interval)/2) * 100, 0)}%"
 
         upload = pd.concat([upload, upload_upper, upload_lower], axis=0)
         if datetime_column is not None:
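
The reflowed f-strings above only change formatting; the label arithmetic is untouched. As a worked example of what they produce, with a 0.9 prediction interval the bound columns are tagged 95.0% and 5.0%:

    prediction_interval = 0.9
    upper_label = f"{round(100 - ((1 - prediction_interval) / 2) * 100, 0)}%"  # '95.0%'
    lower_label = f"{round(((1 - prediction_interval) / 2) * 100, 0)}%"  # '5.0%'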
10 changes: 8 additions & 2 deletions autots/models/cassandra.py
@@ -480,6 +480,7 @@ def fit(
             multivar_df = (
                 trs_df.T.groupby(self.categorical_groups)  # axis=1
                 .mean()
+                .transpose()
                 .iloc[lag_1_indx]
             )
             multivar_df.index = self.df.index
@@ -543,7 +544,7 @@ def fit(
                 x_t = create_changepoint_features(
                     self.df.index,
                     changepoint_spacing=60,
-                    changepoint_distance_end=120,
+                    changepoint_distance_end=180,
                 )
                 x_list.append(x_t)
             else:
@@ -643,20 +644,24 @@ def fit(
                     print(f"Dropping colinear feature columns {corel}")
                 # x_array = x_array.drop(columns=corel)
                 self.drop_colz.extend(corel.tolist())
+
         if self.max_multicolinearity is not None:
             colin = x_array.columns[w < self.max_multicolinearity]
-            if len(colin) > 0:
+            if len(colin) > 1:
                 if self.verbose > 2:
                     print(f"Dropping multi-colinear feature columns {colin}")
                 # x_array = x_array.drop(columns=colin)
                 self.drop_colz.extend(colin.tolist())
+        if len(set(self.drop_colz)) == x_array.shape[1]:
+            self.drop_colz = list(set(self.drop_colz))[1:]
         x_array = x_array.drop(columns=self.drop_colz)
 
         # things we want modeled but want to discard from evaluation (standins)
         remove_patterns = [
             "randnorm_",
             "rolling_trend_",
             "randomwalk_",
+            "changepoint_",
         ]  # "intercept" added after, so not included
 
         # RUN LINEAR MODEL
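
A note on the guard added above: after both colinearity checks fill self.drop_colz, dropping every column would leave an empty design matrix, so one column is spared whenever the drop list covers all of x_array. A minimal sketch of that safeguard on a generic DataFrame (names illustrative):

    import pandas as pd

    x_array = pd.DataFrame({'a': [1, 2], 'b': [2, 4], 'c': [3, 6]})
    drop_colz = ['a', 'b', 'c', 'a']  # duplicates can accrue from both checks
    if len(set(drop_colz)) == x_array.shape[1]:
        drop_colz = list(set(drop_colz))[1:]  # keep one arbitrary column
    x_array = x_array.drop(columns=drop_colz)  # at least one column survives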
@@ -810,6 +815,7 @@ def fit(
             self.residual_uncertainty_upper_std = res_upper.std()
             self.residual_uncertainty_lower_std = res_lower.std()
         else:
+            slope = np.zeros_like(self.df)
             self.residual_uncertainty_upper = pd.Series(0, index=self.df.columns)
             self.residual_uncertainty_lower = pd.Series(0, index=self.df.columns)
             self.residual_uncertainty_upper_std = pd.Series(0, index=self.df.columns)
18 changes: 9 additions & 9 deletions autots/models/ensemble.py
@@ -2340,15 +2340,15 @@ def _buildup_mosaics(
                 f"Mosaic Ensemble failed on model {row[3]} series {row[2]} and period {row[1]} due to missing model: {e} "
                 + mi
             ) from e
-    melted['forecast'] = (
-        fore  # [forecasts[row[3]][row[2]].iloc[row[1]] for row in melted.itertuples()]
-    )
-    melted['upper_forecast'] = (
-        u_fore  # [upper_forecasts[row[3]][row[2]].iloc[row[1]] for row in melted.itertuples()]
-    )
-    melted['lower_forecast'] = (
-        l_fore  # [lower_forecasts[row[3]][row[2]].iloc[row[1]] for row in melted.itertuples()]
-    )
+    melted[
+        'forecast'
+    ] = fore  # [forecasts[row[3]][row[2]].iloc[row[1]] for row in melted.itertuples()]
+    melted[
+        'upper_forecast'
+    ] = u_fore  # [upper_forecasts[row[3]][row[2]].iloc[row[1]] for row in melted.itertuples()]
+    melted[
+        'lower_forecast'
+    ] = l_fore  # [lower_forecasts[row[3]][row[2]].iloc[row[1]] for row in melted.itertuples()]
 
     forecast_df = melted.pivot(
         values="forecast", columns="series_id", index="forecast_period"
97 changes: 61 additions & 36 deletions autots/models/prophet.py
@@ -67,6 +67,7 @@ def __init__(
         random_seed: int = 2024,
         verbose: int = 0,
         n_jobs: int = None,
+        **kwargs,
     ):
         ModelObject.__init__(
             self,
@@ -202,9 +203,10 @@ def seek_the_oracle(
         # n_changepoints -> changepoint_spacing
         # changepoint_range -> changepoint_distance_end
         if self.changepoint_range > 1 or self.changepoint_distance_end is not None:
+            non_null_indices = np.where(current_series["y"].notnull())[0]
             pargs['changepoints'] = get_changepoints(
-                current_series.index[0],
-                current_series.index[-1],
+                current_series.index[non_null_indices[0]],
+                current_series.index[non_null_indices[-1]],
                 changepoint_spacing=(
                     int(len(current_series.index) / self.n_changepoints)
                     if self.changepoint_spacing is None
@@ -219,19 +221,26 @@
             pargs.pop("changepoint_range", None)
             pargs.pop("n_changepoints", None)
         m = Prophet(**pargs)
+        # as currently written this customization only works on daily data
         if self.weekly_seasonality_prior_scale not in [None, "None"]:
             m.add_seasonality(
                 name='weekly',
-                period=7,
+                period=168 if "H" in self.frequency else 7,
                 fourier_order=4,
                 prior_scale=self.weekly_seasonality_prior_scale,
             )
         if self.yearly_seasonality_prior_scale not in [None, "None"]:
             if self.yearly_seasonality_order in [None, "None"]:
                 self.yearly_seasonality_order = 12
+            if "W" in str(self.frequency).upper():
+                yperiod = 52.18
+            elif "M" in str(self.frequency).upper():
+                yperiod = 12
+            else:
+                yperiod = 365.25
             m.add_seasonality(
                 name='yearly',
-                period=365.25,
+                period=yperiod,
                 fourier_order=int(self.yearly_seasonality_order),
                 prior_scale=self.yearly_seasonality_prior_scale,
             )
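
The period arguments above are denominated in steps of the series' own frequency: weekly seasonality spans 168 steps of hourly data but 7 steps of daily data, and a year is roughly 52.18 weekly or 12 monthly steps. A standalone sketch of the same selection logic (function names here are illustrative, not part of the AutoTS API):

    def yearly_period(frequency: str) -> float:
        # Prophet seasonality periods are counted in data-frequency steps
        if "W" in str(frequency).upper():
            return 52.18  # weeks per year
        elif "M" in str(frequency).upper():
            return 12  # months per year
        return 365.25  # days per year, the daily-data default

    def weekly_period(frequency: str) -> int:
        return 168 if "H" in frequency else 7  # hours vs days per week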
@@ -248,7 +257,8 @@
             else:
                 m.add_country_holidays(country_name=args['holiday_country'])
         else:
-            raise ValueError("`holiday` arg for Prophet not recognized")
+            pass
+            # raise ValueError("`holiday` arg for Prophet not recognized")
         if args['regression_type'] in ['User', 'user']:
             current_series = pd.concat(
                 [current_series, args['regressor_train']], axis=1
@@ -427,14 +437,13 @@ def get_new_params(self, method: str = 'random'):
             yearly_seasonality_order = None
         yearly_seasonality_prior_scale = random.choices(
             [None, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 15, 20, 25, 40],  # default 10
-            [0.8, 0.2, 0.05, 0.05, 0.05, 0.05, 0.1, 0.05, 0.05, 0.05, 0.05],
+            [0.4, 0.2, 0.05, 0.05, 0.05, 0.05, 0.1, 0.05, 0.05, 0.05, 0.05],
         )[0]
         if yearly_seasonality_prior_scale is not None:
             yearly_seasonality_order = random.choices(
                 [2, 6, 12, 30], [0.1, 0.2, 0.5, 0.1]
             )[0]
-
-        return {
+        params = {
             'holiday': holiday_choice,
             'regression_type': regression_choice,
             'changepoint_prior_scale': random.choices(
@@ -445,9 +454,37 @@
                 [0.01, 0.1, 1.0, 10.0, 15, 20, 25, 40],  # default 10
                 [0.05, 0.05, 0.05, 0.8, 0.05, 0.05, 0.05, 0.05],
             )[0],
-            'yearly_seasonality_prior_scale': yearly_seasonality_prior_scale,
-            "yearly_seasonality_order": yearly_seasonality_order,
-            'weekly_seasonality_prior_scale': random.choices(
+            'holidays_prior_scale': random.choices(
+                [0.01, 0.1, 1.0, 10.0, 15, 20, 25, 40],  # default 10
+                [0.05, 0.05, 0.05, 0.8, 0.05, 0.05, 0.05, 0.05],
+            )[0],
+            'seasonality_mode': random.choice(['additive', 'multiplicative']),
+            'growth': random.choices(["linear", "flat"], [0.9, 0.1])[0],
+            "trend_phi": random.choices(
+                [None, 0.98, 0.999, 0.95, 0.8, 0.99], [0.8, 0.1, 0.2, 0.1, 0.1, 0.05]
+            )[0],
+        }
+        way = random.choice(["new", "old"])
+        if way == "old":
+            params["n_changepoints"] = random.choices(
+                [5, 10, 20, 25, 30, 40, 50], [0.05, 0.1, 0.1, 0.9, 0.1, 0.05, 0.05]
+            )[0]
+            params["changepoint_range"] = random.choices(
+                [0.8, 0.85, 0.9, 0.95, 0.98, 30, 60, 180, 360],
+                [0.4, 0.3, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
+            )[0]
+        else:
+            params["changepoint_spacing"] = random.choices(
+                [10, 20, 180, 30, 40, 50, 60], [0.05, 0.1, 0.1, 0.1, 0.1, 0.05, 0.9]
+            )[0]
+            params["changepoint_distance_end"] = random.choices(
+                [10, 20, 180, 30, 40, 50, 60], [0.05, 0.1, 0.1, 0.1, 0.1, 0.05, 0.9]
+            )[0]
+        way = random.choice(["new", "old"])
+        if way == "new":
+            params["yearly_seasonality_prior_scale"] = yearly_seasonality_prior_scale
+            params["yearly_seasonality_order"] = yearly_seasonality_order
+            params["weekly_seasonality_prior_scale"] = random.choices(
                 [
                     None,
                     0.0001,
@@ -461,40 +498,28 @@
                     25,
                     40,
                 ],  # default 10
-                [0.8, 0.2, 0.05, 0.05, 0.05, 0.05, 0.1, 0.05, 0.05, 0.05, 0.05],
-            )[0],
-            'holidays_prior_scale': random.choices(
-                [0.01, 0.1, 1.0, 10.0, 15, 20, 25, 40],  # default 10
-                [0.05, 0.05, 0.05, 0.8, 0.05, 0.05, 0.05, 0.05],
-            )[0],
-            'seasonality_mode': random.choice(['additive', 'multiplicative']),
-            'changepoint_range': random.choices(
-                [0.8, 0.85, 0.9, 0.95, 0.98, 30, 60],
-                [0.9, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2],
-            )[0],
-            'growth': random.choices(["linear", "flat"], [0.9, 0.1])[0],
-            'n_changepoints': random.choices(
-                [5, 10, 20, 25, 30, 40, 50], [0.05, 0.1, 0.1, 0.9, 0.1, 0.05, 0.05]
-            )[0],
-            'changepoint_spacing': random.choices(
-                [10, 20, 25, 30, 40, 50, 60], [0.05, 0.1, 0.1, 0.1, 0.1, 0.05, 0.9]
-            )[0],
-            "trend_phi": random.choices(
-                [None, 0.98, 0.999, 0.95, 0.8], [0.8, 0.1, 0.2, 0.1, 0.1]
-            )[0],
-        }
+                [0.4, 0.2, 0.05, 0.05, 0.05, 0.05, 0.1, 0.05, 0.05, 0.05, 0.05],
+            )[0]
+        else:
+            pass
+        return params
+
     def get_params(self):
         """Return dict of current parameters."""
         return {
             'holiday': self.holiday,
             'regression_type': self.regression_type,
             "growth": self.growth,
-            "n_changepoints": self.n_changepoints,
-            "changepoint_prior_scale": self.changepoint_prior_scale,
+            "seasonality_mode": self.seasonality_mode,
+            "changepoint_prior_scale": self.changepoint_prior_scale,
+            "n_changepoints": self.n_changepoints,
+            "changepoint_range": self.changepoint_range,
+            "changepoint_spacing": self.changepoint_spacing,
+            "changepoint_distance_end": self.changepoint_distance_end,
             "seasonality_prior_scale": self.seasonality_prior_scale,
             "yearly_seasonality_prior_scale": self.yearly_seasonality_prior_scale,
             "yearly_seasonality_order": self.yearly_seasonality_order,
             "weekly_seasonality_prior_scale": self.weekly_seasonality_prior_scale,
             "holidays_prior_scale": self.holidays_prior_scale,
             "trend_phi": self.trend_phi,
         }
@@ -541,7 +566,7 @@ def get_changepoints(
         cp_csv = custom_changepoints.replace(" ", "")
         if len(cp_csv) > 0:
             timestamps = [pd.Timestamp(cp_str) for cp_str in cp_csv.split(",")]
-            changepoints = changepoints.append(pd.Series(timestamps))
+            changepoints = pd.concat([changepoints, pd.Series(timestamps)], axis=0)
     changepoints = changepoints.drop_duplicates().sort_values()
     changepoints = changepoints.loc[
         (changepoints > training_start_ds) & (changepoints < training_end_ds)
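
The one-line change above is the core pandas-compatibility fix in this release: Series.append was deprecated in pandas 1.4 and removed in pandas 2.0, so custom changepoints are now attached with pd.concat. A minimal sketch with made-up dates:

    import pandas as pd

    changepoints = pd.Series(pd.to_datetime(["2024-01-01", "2024-06-01"]))
    timestamps = [pd.Timestamp("2024-03-15")]
    # changepoints.append(...) raises AttributeError on pandas >= 2.0
    changepoints = pd.concat([changepoints, pd.Series(timestamps)], axis=0)
    changepoints = changepoints.drop_duplicates().sort_values()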
7 changes: 5 additions & 2 deletions autots/models/sklearn.py
@@ -335,7 +335,8 @@ def rolling_x_regressor_regressor(
         X = X.set_index("series_id", append=True)
     if series_id is not None:
         hashed = (
-            int(hashlib.sha256(str(series_id).encode('utf-8')).hexdigest(), 16) % 10**16
+            int(hashlib.sha256(str(series_id).encode('utf-8')).hexdigest(), 16)
+            % 10**16
         )
         X['series_id'] = hashed
     return X
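
Context for the reflowed expression: the series label is hashed with sha256 and truncated modulo 10**16 so the id becomes a stable numeric feature; unlike Python's built-in hash(), which is salted per interpreter session, this stays deterministic across runs. For illustration:

    import hashlib

    series_id = "example_series"  # hypothetical label
    hashed = (
        int(hashlib.sha256(str(series_id).encode('utf-8')).hexdigest(), 16)
        % 10**16
    )
    print(hashed)  # same integer on every run and machine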
@@ -3984,7 +3985,9 @@ def _rbf_kernel(self, x1, x2, gamma):
         if gamma is None:
             gamma = 1.0 / x1.shape[1]
         distance = (
-            np.sum(x1**2, 1).reshape(-1, 1) + np.sum(x2**2, 1) - 2 * np.dot(x1, x2.T)
+            np.sum(x1**2, 1).reshape(-1, 1)
+            + np.sum(x2**2, 1)
+            - 2 * np.dot(x1, x2.T)
         )
         return np.exp(-gamma * distance)
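
The reflowed expression is the standard expansion ||a - b||^2 = ||a||^2 + ||b||^2 - 2*a.b applied row-wise, which yields all pairwise squared distances without materializing pairwise differences. A quick self-contained check of the identity:

    import numpy as np

    x1, x2 = np.random.rand(5, 3), np.random.rand(4, 3)
    # dot-product expansion, as in _rbf_kernel
    distance = (
        np.sum(x1**2, 1).reshape(-1, 1)
        + np.sum(x2**2, 1)
        - 2 * np.dot(x1, x2.T)
    )
    direct = ((x1[:, None, :] - x2[None, :, :]) ** 2).sum(-1)
    assert np.allclose(distance, direct)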

6 changes: 3 additions & 3 deletions autots/tools/profile.py
@@ -129,9 +129,9 @@ def profile_time_series(
         & (metrics_df['cv_squared'] >= cvar_threshold),
         'PROFILE',
     ] = 'lumpy'
-    metrics_df.loc[metrics_df['zero_diff_proportion'] >= flat_threshold, 'PROFILE'] = (
-        'flat'
-    )
+    metrics_df.loc[
+        metrics_df['zero_diff_proportion'] >= flat_threshold, 'PROFILE'
+    ] = 'flat'
     metrics_df.loc[
         metrics_df['null_percentage'] >= new_product_threshold, 'PROFILE'
     ] = 'new_product'
Expand Down
Binary file modified docs/build/doctrees/environment.pickle
Binary file modified docs/build/doctrees/source/autots.doctree
Binary file modified docs/build/doctrees/source/autots.evaluator.doctree
Binary file modified docs/build/doctrees/source/autots.models.doctree
