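Streamline forecaster and foundation-model pipelines: dict-based metric dispatch, stride read from params in backtest, MLflow run tags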

ryuta-yoshimatsu · ryuta-yoshimatsu · commit 7e3260995b27 · 2025-01-24T06:22:58.000+01:00
diff --git a/mmf_sa/Forecaster.py b/mmf_sa/Forecaster.py
@@ -114,12 +114,15 @@ def prepare_data_for_global_model(self, mode: str = None):
         """
         src_df = self.resolve_source("train_data")
         src_df, removed = DataQualityChecks(src_df, self.conf, self.spark).run()
+
+        # This block runs when preparing data for scoring
         if (mode == "scoring") \
                 and (self.conf["scoring_data"]) \
                 and (self.conf["scoring_data"] != self.conf["train_data"]):
             score_df = self.resolve_source("scoring_data")
             score_df = score_df.where(~col(self.conf["group_id"]).isin(removed))
             src_df = src_df.unionByName(score_df, allowMissingColumns=True)
+
         src_df = src_df.toPandas()
         return src_df, removed
 
@@ -323,9 +326,7 @@ def evaluate_global_model(self, model_conf):
                 model=final_model,
                 registered_model_name=f"{self.conf['model_output']}.{model_conf['name']}_{self.conf['use_case_name']}",
                 input_example=input_example,
-                run_id=self.run_id,
             )
-
             # Next, we train the model only with train_df and run detailed backtesting
             model = self.model_registry.get_model(model_name)
             model.fit(pd.concat([train_df]))
@@ -336,6 +337,8 @@ def evaluate_global_model(self, model_conf):
                 model_uri=model_info.model_uri,  # This model_uri is from the final model
                 write=True,
             )
+            mlflow.set_tag("run_id", self.run_id)
+            mlflow.set_tag("model_name", model.params["name"])
 
     def backtest_global_model(
         self,
@@ -423,13 +426,14 @@ def evaluate_foundation_model(self, model_conf):
         with mlflow.start_run(experiment_id=self.experiment_id) as run:
             model_name = model_conf["name"]
             model = self.model_registry.get_model(model_name)
-            # For now, only support registering chronos, moirai and moment models
+            hist_df, removed = self.prepare_data_for_global_model("evaluating")  # Reuse the same as global
+            train_df, val_df = self.split_df_train_val(hist_df)
+            input_example = train_df[train_df[self.conf['group_id']] == train_df[self.conf['group_id']] \
+                .unique()[0]].sort_values(by=[self.conf['date_col']])
             if model_conf["framework"] in ["Chronos", "Moirai", "Moment", "TimesFM"]:
                 model.register(
-                    registered_model_name=f"{self.conf['model_output']}.{model_conf['name']}_{self.conf['use_case_name']}"
+                    registered_model_name=f"{self.conf['model_output']}.{model_conf['name']}_{self.conf['use_case_name']}",
                 )
-            hist_df, removed = self.prepare_data_for_global_model("evaluating")  # Reuse the same as global
-            train_df, val_df = self.split_df_train_val(hist_df)
             model_uri = f"runs:/{run.info.run_id}/model"
             metrics = self.backtest_global_model(  # Reuse the same as global
                 model=model,
diff --git a/mmf_sa/models/abstract_model.py b/mmf_sa/models/abstract_model.py
@@ -47,7 +47,6 @@ def backtest(
             df: pd.DataFrame,
             start: pd.Timestamp,
             group_id: Union[str, int] = None,
-            stride: int = None,
             # backtest_retrain: bool = False,
             spark=None,
     ) -> pd.DataFrame:
@@ -58,19 +57,18 @@ def backtest(
             df (pd.DataFrame): A pandas DataFrame.
             start (pd.Timestamp): A pandas Timestamp object.
             group_id (Union[str, int], optional): A string or an integer specifying the group id. Default is None.
-            stride (int, optional): An integer specifying the stride. Default is None.
             spark (SparkSession, optional): A SparkSession object. Default is None.
         Returns: res_df (pd.DataFrame): A pandas DataFrame.
         """
-        if stride is None:
-            stride = int(self.params.get("stride", 7))
+        stride = int(self.params["stride"]) # Read in stride
         stride_offset = (
             pd.offsets.MonthEnd(stride)
             if self.freq == "M"
             else pd.DateOffset(days=stride)
         )
         df = df.copy().sort_values(by=[self.params["date_col"]])
-        end_date = df[self.params["date_col"]].max()
+        end_date = df[self.params["date_col"]].max() # Last date from the training data
+        # Offsets the timestamp: e.g. if it's in the middle of the month, makes it the end of the month
         curr_date = start + self.one_ts_offset
         # print("end_date = ", end_date)
 
diff --git a/mmf_sa/models/chronosforecast/ChronosPipeline.py b/mmf_sa/models/chronosforecast/ChronosPipeline.py
@@ -71,7 +71,8 @@ def prepare_data(self, df: pd.DataFrame, future: bool = False, spark=None) -> Da
             .agg(
                 collect_list(self.params.date_col).alias('ds'),
                 collect_list(self.params.target).alias('y'),
-            ))
+            )).withColumnRenamed(self.params.group_id, "unique_id")
+
         return df
 
     def predict(self,
@@ -110,37 +111,24 @@ def calculate_metrics(
         pred_df, model_pretrained = self.predict(hist_df, val_df, curr_date, spark)
         keys = pred_df[self.params["group_id"]].unique()
         metrics = []
-        if self.params["metric"] == "smape":
-            metric_name = "smape"
-        elif self.params["metric"] == "mape":
-            metric_name = "mape"
-        elif self.params["metric"] == "mae":
-            metric_name = "mae"
-        elif self.params["metric"] == "mse":
-            metric_name = "mse"
-        elif self.params["metric"] == "rmse":
-            metric_name = "rmse"
-        else:
+        metric_name = self.params["metric"]
+        if metric_name not in ("smape", "mape", "mae", "mse", "rmse"):
             raise Exception(f"Metric {self.params['metric']} not supported!")
         for key in keys:
             actual = val_df[val_df[self.params["group_id"]] == key][self.params["target"]].to_numpy()
             forecast = pred_df[pred_df[self.params["group_id"]] == key][self.params["target"]].to_numpy()[0]
+            # Map each metric name to its metric instance
+            metric_classes = {
+                "smape": MeanAbsolutePercentageError(symmetric=True),
+                "mape": MeanAbsolutePercentageError(symmetric=False),
+                "mae": MeanAbsoluteError(),
+                "mse": MeanSquaredError(square_root=False),
+                "rmse": MeanSquaredError(square_root=True),
+            }
             try:
-                if metric_name == "smape":
-                    smape = MeanAbsolutePercentageError(symmetric=True)
-                    metric_value = smape(actual, forecast)
-                elif metric_name == "mape":
-                    mape = MeanAbsolutePercentageError(symmetric=False)
-                    metric_value = mape(actual, forecast)
-                elif metric_name == "mae":
-                    mae = MeanAbsoluteError()
-                    metric_value = mae(actual, forecast)
-                elif metric_name == "mse":
-                    mse = MeanSquaredError(square_root=False)
-                    metric_value = mse(actual, forecast)
-                elif metric_name == "rmse":
-                    rmse = MeanSquaredError(square_root=True)
-                    metric_value = rmse(actual, forecast)
+                if metric_name in metric_classes:
+                    metric_function = metric_classes[metric_name]
+                    metric_value = metric_function(actual, forecast)
                 metrics.extend(
                     [(
                         key,
@@ -240,6 +228,7 @@ def __init__(self, params):
         self.params = params
         self.repo = "amazon/chronos-bolt-small"
 
+
 class ChronosBoltBase(ChronosForecaster):
     def __init__(self, params):
         super().__init__(params)
@@ -268,4 +257,3 @@ def predict(self, context, input_data, params=None):
             prediction_length=self.prediction_length,
         )
         return forecast.numpy()
-
diff --git a/mmf_sa/models/moiraiforecast/MoiraiPipeline.py b/mmf_sa/models/moiraiforecast/MoiraiPipeline.py
@@ -69,7 +69,7 @@ def prepare_data(self, df: pd.DataFrame, future: bool = False, spark=None) -> Da
             .agg(
                 collect_list(self.params.date_col).alias('ds'),
                 collect_list(self.params.target).alias('y'),
-            ))
+            )).withColumnRenamed(self.params.group_id, "unique_id")
         return df
 
     def predict(self,
@@ -110,37 +110,24 @@ def calculate_metrics(
         pred_df, model_pretrained = self.predict(hist_df, val_df, curr_date, spark)
         keys = pred_df[self.params["group_id"]].unique()
         metrics = []
-        if self.params["metric"] == "smape":
-            metric_name = "smape"
-        elif self.params["metric"] == "mape":
-            metric_name = "mape"
-        elif self.params["metric"] == "mae":
-            metric_name = "mae"
-        elif self.params["metric"] == "mse":
-            metric_name = "mse"
-        elif self.params["metric"] == "rmse":
-            metric_name = "rmse"
-        else:
+        metric_name = self.params["metric"]
+        if metric_name not in ("smape", "mape", "mae", "mse", "rmse"):
             raise Exception(f"Metric {self.params['metric']} not supported!")
         for key in keys:
             actual = val_df[val_df[self.params["group_id"]] == key][self.params["target"]].to_numpy()
             forecast = pred_df[pred_df[self.params["group_id"]] == key][self.params["target"]].to_numpy()[0]
+            # Map each metric name to its metric instance
+            metric_classes = {
+                "smape": MeanAbsolutePercentageError(symmetric=True),
+                "mape": MeanAbsolutePercentageError(symmetric=False),
+                "mae": MeanAbsoluteError(),
+                "mse": MeanSquaredError(square_root=False),
+                "rmse": MeanSquaredError(square_root=True),
+            }
             try:
-                if metric_name == "smape":
-                    smape = MeanAbsolutePercentageError(symmetric=True)
-                    metric_value = smape(actual, forecast)
-                elif metric_name == "mape":
-                    mape = MeanAbsolutePercentageError(symmetric=False)
-                    metric_value = mape(actual, forecast)
-                elif metric_name == "mae":
-                    mae = MeanAbsoluteError()
-                    metric_value = mae(actual, forecast)
-                elif metric_name == "mse":
-                    mse = MeanSquaredError(square_root=False)
-                    metric_value = mse(actual, forecast)
-                elif metric_name == "rmse":
-                    rmse = MeanSquaredError(square_root=True)
-                    metric_value = rmse(actual, forecast)
+                if metric_name in metric_classes:
+                    metric_function = metric_classes[metric_name]
+                    metric_value = metric_function(actual, forecast)
                 metrics.extend(
                     [(
                         key,
diff --git a/mmf_sa/models/momentforecast/MomentPipeline.py b/mmf_sa/models/momentforecast/MomentPipeline.py
@@ -115,37 +115,24 @@ def calculate_metrics(
         pred_df, model_pretrained = self.predict(hist_df, val_df, curr_date, spark)
         keys = pred_df[self.params["group_id"]].unique()
         metrics = []
-        if self.params["metric"] == "smape":
-            metric_name = "smape"
-        elif self.params["metric"] == "mape":
-            metric_name = "mape"
-        elif self.params["metric"] == "mae":
-            metric_name = "mae"
-        elif self.params["metric"] == "mse":
-            metric_name = "mse"
-        elif self.params["metric"] == "rmse":
-            metric_name = "rmse"
-        else:
+        metric_name = self.params["metric"]
+        if metric_name not in ("smape", "mape", "mae", "mse", "rmse"):
             raise Exception(f"Metric {self.params['metric']} not supported!")
         for key in keys:
             actual = val_df[val_df[self.params["group_id"]] == key][self.params["target"]].to_numpy()
             forecast = pred_df[pred_df[self.params["group_id"]] == key][self.params["target"]].to_numpy()[0]
+            # Map each metric name to its metric instance
+            metric_classes = {
+                "smape": MeanAbsolutePercentageError(symmetric=True),
+                "mape": MeanAbsolutePercentageError(symmetric=False),
+                "mae": MeanAbsoluteError(),
+                "mse": MeanSquaredError(square_root=False),
+                "rmse": MeanSquaredError(square_root=True),
+            }
             try:
-                if metric_name == "smape":
-                    smape = MeanAbsolutePercentageError(symmetric=True)
-                    metric_value = smape(actual, forecast)
-                elif metric_name == "mape":
-                    mape = MeanAbsolutePercentageError(symmetric=False)
-                    metric_value = mape(actual, forecast)
-                elif metric_name == "mae":
-                    mae = MeanAbsoluteError()
-                    metric_value = mae(actual, forecast)
-                elif metric_name == "mse":
-                    mse = MeanSquaredError(square_root=False)
-                    metric_value = mse(actual, forecast)
-                elif metric_name == "rmse":
-                    rmse = MeanSquaredError(square_root=True)
-                    metric_value = rmse(actual, forecast)
+                if metric_name in metric_classes:
+                    metric_function = metric_classes[metric_name]
+                    metric_value = metric_function(actual, forecast)
                 metrics.extend(
                     [(
                         key,
diff --git a/mmf_sa/models/neuralforecast/NeuralForecastPipeline.py b/mmf_sa/models/neuralforecast/NeuralForecastPipeline.py
diff --git a/mmf_sa/models/timesfmforecast/TimesFMPipeline.py b/mmf_sa/models/timesfmforecast/TimesFMPipeline.py