0.3.5

winedarksea · Aug 30, 2021 · d20e3bb · d20e3bb
2 parents 0ab2e48 + f24d7ad
commit d20e3bb
Show file tree

Hide file tree

Showing 39 changed files with 1,277 additions and 479 deletions.
diff --git a/README.md b/README.md
@@ -117,6 +117,7 @@ Also take a look at the [production_example.py](https://github.com/winedarksea/A
 * Set `model_interrupt=True`, which passes over the current model when a `KeyboardInterrupt` (i.e. `ctrl+c`) is pressed (although if the interrupt falls between generations it will stop the entire training).
 * Use the `result_file` argument of `.fit()`, which will save progress after each generation - helpful if a long training is being done. Use `import_results` to recover.
 * While Transformations are pretty fast, setting `transformer_max_depth` to a lower number (say, 2) will increase speed. Also utilize `transformer_list`.
+* Check out [this example](https://github.com/winedarksea/AutoTS/discussions/76) of using AutoTS with pandas UDF.
 * Ensembles are obviously slower to predict because they run many models, 'distance' models 2x slower, and 'simple' models 3x-5x slower.
 	* `ensemble='horizontal-max'` with `model_list='no_shared_fast'` can scale relatively well given many cpu cores because each model is only run on the series it is needed for.
 * Reducing `num_validations` and `models_to_validate` will decrease runtime but may lead to poorer model selections.

diff --git a/TODO.md b/TODO.md
@@ -15,18 +15,16 @@
 * Forecasts are desired for the future immediately following the most recent data.
 
 # Latest
-* improvements to joblib parallelized models (not copying the full df)
-* additional parameter checks
-* made "auto" cpu_count even more conservative
-* improved 'Score' generation. It should now be more equally weighted across metrics.
-* fixed potential bug for horizontal ensemble selection if perfect forecasts were delivered
-* Horizontal ensembles now chosen by combination of multiple metrics and metric_weighting (mae, rmse, spl, contour)
-* re-weighted fillna probabilities for random choice
-* addressed a few deprecation warnings
-* new plot_horizontal() function for AutoTS to quickly visualize horizontal ensembles
-* Probabilistic and HDist ensembles are now deprecated (they can still be run by model_forecast but not by AutoTS class)
-* new introduce_na parameter which makes series more robust to the last values being NaN in final but never in any validation
-* Mosaic Ensembles! These can offer major improvements to MAE, but are also less stable than horizontal ensembles.
+* New Transformer: ScipyFilter
+* New models: UnivariateMotif and MultivariateMotif
+* Added 'midhinge' and 'weighted_mean' to AverageValueNaive
+* Added passing of regressors to WindowRegression and made window generation more efficient
+* more plotting methods: plot_horizontal_transformers
+* for most -Regression type models, `model_params` is now treated as kwargs and can accept any args for that model
+* Added ExtraTrees and RadiusRegressor to -Regression type models
+* bug fix in generate_score_per_series
+* 'Generation' now tracked in results table, plus plotting method for generation loss
+
 
 # Errors: 
 DynamicFactor holidays 	Exceptions 'numpy.ndarray' object has no attribute 'values'

diff --git a/autots/__init__.py b/autots/__init__.py
@@ -19,7 +19,7 @@
 from autots.tools.regressor import create_lagged_regressor
 from autots.evaluator.auto_model import model_forecast
 
-__version__ = '0.3.4'
+__version__ = '0.3.5'
 
 TransformTS = GeneralTransformer
 

diff --git a/autots/evaluator/auto_model.py b/autots/evaluator/auto_model.py
@@ -28,6 +28,7 @@
     AverageValueNaive,
     SeasonalNaive,
     ZeroesNaive,
+    Motif,
 )
 from autots.models.statsmodels import (
     GLS,
@@ -400,6 +401,28 @@ def ModelMonster(
             **parameters,
         )
 
+        return model
+    elif model == 'MultivariateMotif':
+        model = Motif(
+            frequency=frequency,
+            prediction_interval=prediction_interval,
+            random_seed=random_seed,
+            verbose=verbose,
+            n_jobs=n_jobs,
+            multivariate=True,
+            **parameters,
+        )
+        return model
+    elif model == 'UnivariateMotif':
+        model = Motif(
+            frequency=frequency,
+            prediction_interval=prediction_interval,
+            random_seed=random_seed,
+            verbose=verbose,
+            n_jobs=n_jobs,
+            multivariate=False,
+            **parameters,
+        )
         return model
     else:
         raise AttributeError(
@@ -1061,6 +1084,7 @@ def TemplateWizard(
                     'Ensemble': ensemble_input,
                     'Exceptions': np.nan,
                     'Runs': 1,
+                    'Generation': current_generation,
                     'ValidationRound': validation_round,
                 },
                 index=[0],
@@ -1137,6 +1161,7 @@ def TemplateWizard(
                         'TotalRuntime': datetime.timedelta(0),
                         'Exceptions': "KeyboardInterrupt by user",
                         'Runs': 1,
+                        'Generation': current_generation,
                         'ValidationRound': validation_round,
                     },
                     index=[0],
@@ -1184,6 +1209,7 @@ def TemplateWizard(
                     'TotalRuntime': datetime.timedelta(0),
                     'Exceptions': repr(e),
                     'Runs': 1,
+                    'Generation': current_generation,
                     'ValidationRound': validation_round,
                 },
                 index=[0],
@@ -1698,12 +1724,17 @@ def generate_score_per_series(results_object, metric_weighting, total_validation
             overall_score = overall_score + (contour_score * contour_weighting)
     # remove basic duplicates
     local_results = results_object.model_results.copy()
+    local_results = local_results[local_results['Exceptions'].isna()]
     local_results = local_results.sort_values(by="TotalRuntimeSeconds", ascending=True)
     local_results.drop_duplicates(
         subset=['ValidationRound', 'smape', 'mae', 'spl'], keep="first", inplace=True
     )
     # select only models run through all validations
+    # run_count = temp.groupby(level=0).count().mean(axis=1)
+    # models_to_use = run_count[run_count >= total_validations].index.tolist()
     run_count = local_results[['Model', 'ID']].groupby("ID").count()
     models_to_use = run_count[run_count['Model'] >= total_validations].index.tolist()
     overall_score = overall_score[overall_score.index.isin(models_to_use)]
+    # take the average score across validations
+    overall_score = overall_score.groupby(level=0).mean()
     return overall_score
diff --git a/autots/evaluator/auto_ts.py b/autots/evaluator/auto_ts.py
@@ -88,6 +88,13 @@ class AutoTS(object):
     Attributes:
         best_model (pandas.DataFrame): DataFrame containing template for the best ranked model
         regression_check (bool): If True, the best_model uses an input 'User' future_regressor
+
+    Methods:
+        fit, predict
+        export_template, import_template, import_results
+        results, failure_rate
+        horizontal_to_df, mosaic_to_df
+        plot_horizontal, plot_horizontal_transformers, plot_generation_loss
     """
 
     def __init__(
@@ -499,7 +506,7 @@ def fit(
         ensemble = self.ensemble
 
         # check if NaN in last row
-        nan_tail = df_wide_numeric.tail(1).isna().sum(axis=1).iloc[0] > 0
+        self._nan_tail = df_wide_numeric.tail(1).isna().sum(axis=1).iloc[0] > 0
 
         self.df_wide_numeric = df_wide_numeric
         self.startTimeStamps = df_wide_numeric.notna().idxmax()
@@ -705,6 +712,7 @@ def fit(
                     model_interrupt=self.model_interrupt,
                     grouping_ids=self.grouping_ids,
                     random_seed=random_seed,
+                    current_generation=(current_generation + 1),
                     verbose=verbose,
                     n_jobs=self.n_jobs,
                 )
@@ -890,9 +898,8 @@ def fit(
                     val_future_regressor_test = []
 
                 # force NaN for robustness
-                if self.introduce_na or (self.introduce_na is None and nan_tail):
+                if self.introduce_na or (self.introduce_na is None and self._nan_tail):
                     nan_frac = val_df_train.shape[1] / num_validations
-                    int(nan_frac * y), int(nan_frac * (y + 1))
                     val_df_train.iloc[
                         -1, int(nan_frac * y) : int(nan_frac * (y + 1))
                     ] = np.nan
@@ -903,7 +910,6 @@ def fit(
                     df_train=val_df_train,
                     df_test=val_df_test,
                     weights=current_weights,
-                    # model_count=model_count,
                     forecast_length=forecast_length,
                     frequency=frequency,
                     prediction_interval=prediction_interval,
@@ -1192,6 +1198,7 @@ def predict(
                     grouping_ids=self.grouping_ids,
                     random_seed=self.random_seed,
                     verbose=verbose,
+                    n_jobs=self.n_jobs,
                     template_cols=self.template_cols,
                 )
                 # convert categorical back to numeric
@@ -1223,6 +1230,7 @@ def predict(
                 grouping_ids=self.grouping_ids,
                 random_seed=self.random_seed,
                 verbose=verbose,
+                n_jobs=self.n_jobs,
                 template_cols=self.template_cols,
             )
             # convert categorical back to numeric
@@ -1446,7 +1454,8 @@ def horizontal_to_df(self):
             raise ValueError("No best_model. AutoTS .fit() needs to be run.")
         if self.best_model['Ensemble'].iloc[0] != 2:
             raise ValueError("Only works on horizontal ensemble type models.")
-        series = json.loads(self.best_model['ModelParameters'].iloc[0])['series']
+        ModelParameters = json.loads(self.best_model['ModelParameters'].iloc[0])
+        series = ModelParameters['series']
         series = pd.DataFrame.from_dict(series, orient="index").reset_index(drop=False)
         if series.shape[1] > 2:
             # for mosaic style ensembles, choose the mode model id
@@ -1463,6 +1472,22 @@ def horizontal_to_df(self):
             self.df_wide_numeric.mean().to_frame(), right_index=True, left_on="Series"
         )
         series.columns = ["Series", "ID", 'Model', "Volatility", "Mean"]
+        series['Transformers'] = series['ID'].copy()
+        series['FillNA'] = series['ID'].copy()
+        lookup = {}
+        na_lookup = {}
+        for k, v in ModelParameters['models'].items():
+            try:
+                trans_params = json.loads(v.get('TransformationParameters', '{}'))
+                lookup[k] = ",".join(trans_params.get('transformations', {}).values())
+                na_lookup[k] = trans_params.get('fillna', '')
+            except Exception:
+                lookup[k] = "None"
+                na_lookup[k] = "None"
+        series['Transformers'] = (
+            series['Transformers'].replace(lookup).replace("", "None")
+        )
+        series['FillNA'] = series['FillNA'].replace(na_lookup).replace("", "None")
         return series
 
     def mosaic_to_df(self):
@@ -1500,6 +1525,90 @@ def plot_horizontal(self, max_series: int = 20, **kwargs):
             'log(Volatility)'
         ].plot(style='o', **kwargs)
 
+    def plot_horizontal_transformers(
+        self, method="transformers", color_list=None, **kwargs
+    ):
+        """Simple plot to visualize transformers used.
+        Note this doesn't capture transformers nested in simple ensembles.
+
+        Args:
+            method (str): 'fillna' or 'transformers' - which to plot
+            color_list = list of colors to *sample* for bar colors. Can be names or hex.
+            **kwargs passed to pandas.plot()
+        """
+        series = self.horizontal_to_df()
+        if str(method).lower() == "fillna":
+            transformers = series['FillNA'].value_counts()
+        else:
+            transformers = pd.Series(
+                ",".join(series['Transformers']).split(",")
+            ).value_counts()
+        if color_list is None:
+            color_list = colors_list
+        colors = random.sample(color_list, transformers.shape[0])
+        # plot
+        transformers.plot(kind='bar', color=colors, **kwargs)
+
+    def plot_generation_loss(self, **kwargs):
+        """Plot improvement in accuracy over generations.
+        Note: this is only "one size fits all" accuracy and
+        doesn't account for the benefits seen for ensembling.
+
+        Args:
+            **kwargs passed to pd.DataFrame.plot()
+        """
+        for_gens = self.initial_results.model_results[
+            (self.initial_results.model_results['ValidationRound'] == 0)
+            & (self.initial_results.model_results['Ensemble'] < 1)
+        ]
+        for_gens.groupby("Generation")['Score'].min().cummin().plot(
+            ylabel="Lowest Score", **kwargs
+        )
+
+
+colors_list = [
+    '#FF00FF',
+    '#7FFFD4',
+    '#00FFFF',
+    '#F5DEB3',
+    '#FF6347',
+    '#8B008B',
+    '#696969',
+    '#FFC0CB',
+    '#C71585',
+    '#008080',
+    '#663399',
+    '#32CD32',
+    '#66CDAA',
+    '#A9A9A9',
+    '#2F4F4F',
+    '#FFDEAD',
+    '#800000',
+    '#FDF5E6',
+    '#F5F5F5',
+    '#F0FFF0',
+    '#87CEEB',
+    '#A52A2A',
+    '#90EE90',
+    '#7FFF00',
+    '#E9967A',
+    '#1E90FF',
+    '#FFF0F5',
+    '#ADD8E6',
+    '#008B8B',
+    '#FFF5EE',
+    '#00FA9A',
+    '#9370DB',
+    '#4682B4',
+    '#006400',
+    '#AFEEEE',
+    '#CD853F',
+    '#9400D3',
+    '#EE82EE',
+    '#00008B',
+    '#4B0082',
+]
+
 
 class AutoTSIntervals(object):
     """Autots looped to test multiple prediction intervals. Experimental.