From b95f99dd8c4ec13815befb6c344a0c1bbfef4d20 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Fri, 18 Aug 2023 10:01:19 +0300 Subject: [PATCH 01/43] setup preprocessing methods --- src/airflow/airflow-requirements.txt | 3 +- src/airflow/airqo_etl_utils/bigquery_api.py | 25 +++++++- src/airflow/airqo_etl_utils/ml_utils.py | 67 ++++++++++++++++----- src/airflow/dev-requirements.txt | 3 +- src/airflow/requirements.txt | 3 +- 5 files changed, 81 insertions(+), 20 deletions(-) diff --git a/src/airflow/airflow-requirements.txt b/src/airflow/airflow-requirements.txt index 977410828f..48af8ae3aa 100644 --- a/src/airflow/airflow-requirements.txt +++ b/src/airflow/airflow-requirements.txt @@ -6,4 +6,5 @@ apache-airflow[sentry] lightgbm mlflow gcsfs -pymongo \ No newline at end of file +pymongo +category-encoders \ No newline at end of file diff --git a/src/airflow/airqo_etl_utils/bigquery_api.py b/src/airflow/airqo_etl_utils/bigquery_api.py index 4a2ed54181..879f25db5b 100644 --- a/src/airflow/airqo_etl_utils/bigquery_api.py +++ b/src/airflow/airqo_etl_utils/bigquery_api.py @@ -619,7 +619,7 @@ def fetch_data(self, start_date_time: str, historical: bool = False): # historical is for the actual jobs, not training query = f""" SELECT DISTINCT timestamp as created_at, {"site_id," if historical else ""} device_number, pm2_5_calibrated_value as pm2_5 - FROM `{configuration.BIGQUERY_HOURLY_EVENTS_TABLE_PROD}` + FROM `{self.hourly_measurements_table_prod}` WHERE DATE(timestamp) >= '{start_date_time}' and device_number IS NOT NULL ORDER BY created_at, device_number """ @@ -630,6 +630,29 @@ def fetch_data(self, start_date_time: str, historical: bool = False): df = self.client.query(f"{query}", job_config).result().to_dataframe() return df + def \ + fetch_training_data(self, start_date_time:str,) -> pd.DataFrame: + query = f""" + SELECT DISTINCT + t1.device_id, + t1.timestamp, + t1.site_id, + t1.pm2_5_calibrated_value, + t2.latitude, + t2.longitude, + t3.device_category + FROM `{self.hourly_measurements_table_prod}` t1 + JOIN `{self.sites_table}` t2 on t1.site_id = t2.id + JOIN `{self.devices_table}` t3 on t1.device_id = t3.device_id + WHERE date(t1.timestamp) >= '{start_date_time}' and t1.device_id IS NOT NULL + ORDER BY device_id, timestamp""" + + job_config = bigquery.QueryJobConfig() + job_config.use_query_cache = True + + df = self.client.query(f"{query}", job_config).result().to_dataframe() + return df + @staticmethod def save_forecasts_to_bigquery(df, table): """saves the dataframes to the bigquery tables""" diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 8a7adb8e5a..380e8e58a3 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd import pymongo as pm +from category_encoders import OneHotEncoder, CountEncoder from lightgbm import LGBMRegressor, early_stopping from scipy.stats import skew from sklearn.metrics import mean_squared_error @@ -49,38 +50,36 @@ class ForecastUtils: ###FORECAST MODEL TRAINING UTILS#### @staticmethod def preprocess_training_data(data, frequency): - data["created_at"] = pd.to_datetime(data["created_at"]) - data["device_number"] = data["device_number"].astype(str) - data["pm2_5"] = data.groupby("device_number")["pm2_5"].transform( + data["timestamp"] = pd.to_datetime(data["timestamp"]) + data["pm2_5"] = data.groupby("device_id")["pm2_5"].transform( lambda x: 
x.interpolate(method="linear", limit_direction="both") ) if frequency == "daily": data = ( - data.groupby(["device_number"]) - .resample("D", on="created_at") + data.groupby(["device_id"]) + .resample("D", on="timestamp") .mean(numeric_only=True) ) data.reset_index(inplace=True) - data["pm2_5"] = data.groupby("device_number")["pm2_5"].transform( + data["pm2_5"] = data.groupby("device_id")["pm2_5"].transform( lambda x: x.interpolate(method="linear", limit_direction="both") ) - data["device_number"] = data["device_number"].astype(int) data = data.dropna(subset=["pm2_5"]) return data @staticmethod - def feature_eng_training_data(data, target_column, frequency): + def feature_eng_training_data(data, target_column, frequency): def get_lag_features(df, target_col, freq): - df = df.sort_values(by=["device_number", "created_at"]) + df = df.sort_values(by=["device_id", "timestamp"]) if freq == "daily": - shifts = [1, 2] + shifts = [1, 2, 3, 7, 14] for s in shifts: - df[f"pm2_5_last_{s}_day"] = df.groupby(["device_number"])[ + df[f"pm2_5_last_{s}_day"] = df.groupby(["device_id"])[ target_col ].shift(s) - shifts = [3, 7, 14, 30] + shifts = [2, 3, 7, 14] functions = ["mean", "std", "max", "min"] for s in shifts: for f in functions: @@ -127,10 +126,46 @@ def get_other_features(df_tmp, freq): print("Additional features added") return df_tmp + def encode_categorical_features(df_tmp): + #use count_encoding on site_id & device_id,also save the real values & what they've been encoded to in a dictionary - data["created_at"] = pd.to_datetime(data["created_at"]) + #encode site_id + site_id_encoder = CountEncoder() + site_id_encoder.fit(df_tmp['site_id']) + df_tmp['site_id'] = site_id_encoder.transform(df_tmp['site_id']) + # site_id_encoder_dict = site_id_encoder.mapping[0]['mapping'] + + #encode device_id + device_id_encoder = CountEncoder() + device_id_encoder.fit(df_tmp['device_id']) + df_tmp['device_id'] = device_id_encoder.transform(df_tmp['device_id']) + # device_id_encoder_dict = device_id_encoder.mapping[0]['mapping'] + + device_category_encoder = OneHotEncoder(cols=['device_category']) + df_tmp = device_category_encoder.fit_transform(df_tmp) + + return df_tmp + + def get_time_and_cyclic_features(df): + attributes = ["year", "month", "day", "dayofweek", "hour"] + max_vals = [2023, 12, 31, 6, 23] + for a, m in zip(attributes, max_vals): + df[a] = df["timestamp"].dt.__getattribute__(a) + df[a + "_sin"] = np.sin(2 * np.pi * df[a] / m) + df[a + "_cos"] = np.cos(2 * np.pi * df[a] / m) + + df["week"] = df["timestamp"].dt.isocalendar().week + df["week_sin"] = np.sin(2 * np.pi * df["week"] / 52) + df["week_cos"] = np.cos(2 * np.pi * df["week"] / 52) + df.drop(columns=attributes, inplace=True) + return df + + + data["timestamp"] = pd.to_datetime(data["timestamp"]) df_tmp = get_other_features(data, frequency) df_tmp = get_lag_features(df_tmp, target_column, frequency) + df_tmp = encode_categorical_features(df_tmp) + df_tmp = get_time_and_cyclic_features(df_tmp) return df_tmp @@ -234,9 +269,9 @@ def train_and_save_hourly_forecast_model(train): # separate code for hourly mod @staticmethod def train_and_save_daily_forecast_model(train): # separate code for monthly model - train["created_at"] = pd.to_datetime(train["created_at"]) - train = train.sort_values(by=["device_number", "created_at"]) - features = [c for c in train.columns if c not in ["created_at", "pm2_5"]] + train["timestamp"] = pd.to_datetime(train["timestamp"]) + train = train.sort_values(by=['device_id', 'timestamp']) + features = [c for c in 
train.columns if c not in ["timestamp", "pm2_5"]] print(features) target_col = "pm2_5" train_data, test_data = pd.DataFrame(), pd.DataFrame() diff --git a/src/airflow/dev-requirements.txt b/src/airflow/dev-requirements.txt index 81c23b0562..59d0561bea 100644 --- a/src/airflow/dev-requirements.txt +++ b/src/airflow/dev-requirements.txt @@ -18,4 +18,5 @@ mlflow lightgbm gcsfs pymongo -pytest \ No newline at end of file +pytest +category_encoders \ No newline at end of file diff --git a/src/airflow/requirements.txt b/src/airflow/requirements.txt index c79865c3cf..947c051adc 100644 --- a/src/airflow/requirements.txt +++ b/src/airflow/requirements.txt @@ -18,4 +18,5 @@ gcsfs pymongo~=4.4.1 pytest~=7.4.0 -scipy~=1.11.1 \ No newline at end of file +scipy~=1.11.1 +category_encoders \ No newline at end of file From 9d541f1e15ea548fc8c90c3ba8a2ba990db1682d Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Sun, 20 Aug 2023 23:30:35 +0300 Subject: [PATCH 02/43] training job code setup --- src/airflow/airqo_etl_utils/air_beam_api.py | 58 +-- src/airflow/airqo_etl_utils/airnow_api.py | 8 +- src/airflow/airqo_etl_utils/airqo_api.py | 19 +- src/airflow/airqo_etl_utils/bigquery_api.py | 8 +- src/airflow/airqo_etl_utils/config.py | 1 + src/airflow/airqo_etl_utils/ml_utils.py | 434 ++++++++---------- src/airflow/airqo_etl_utils/plume_labs_api.py | 17 +- src/airflow/airqo_etl_utils/purple_air_api.py | 18 +- src/airflow/airqo_etl_utils/tahmo_api.py | 21 +- 9 files changed, 278 insertions(+), 306 deletions(-) diff --git a/src/airflow/airqo_etl_utils/air_beam_api.py b/src/airflow/airqo_etl_utils/air_beam_api.py index 57adacfcf9..657d69dbf4 100644 --- a/src/airflow/airqo_etl_utils/air_beam_api.py +++ b/src/airflow/airqo_etl_utils/air_beam_api.py @@ -24,25 +24,25 @@ def get_stream_ids( username: str, pollutant: str, ): - params={ - "q": json.dumps( - { - "time_from": int(start_date_time.timestamp()), - "time_to": int(end_date_time.timestamp()), - "tags": "", - "usernames": username, - "west": 10.581214853439886, - "east": 38.08577769782265, - "south": -36.799337832603314, - "north": -19.260169583742446, - "limit": 100, - "offset": 0, - "sensor_name": f"airbeam3-{pollutant}", - "measurement_type": "Particulate Matter", - "unit_symbol": "µg/m³", - } - ) - } + params = { + "q": json.dumps( + { + "time_from": int(start_date_time.timestamp()), + "time_to": int(end_date_time.timestamp()), + "tags": "", + "usernames": username, + "west": 10.581214853439886, + "east": 38.08577769782265, + "south": -36.799337832603314, + "north": -19.260169583742446, + "limit": 100, + "offset": 0, + "sensor_name": f"airbeam3-{pollutant}", + "measurement_type": "Particulate Matter", + "unit_symbol": "µg/m³", + } + ) + } request = self.__request( endpoint=f"mobile/sessions.json", params=params, @@ -65,32 +65,32 @@ def get_measurements( endpoint=f"measurements.json", params=params, ) - - def __request(self, endpoint, params): + def __request(self, endpoint, params): url = f"{self.AIR_BEAM_BASE_URL}{endpoint}" retry_strategy = Retry( total=5, backoff_factor=5, ) - + http = urllib3.PoolManager(retries=retry_strategy) - + try: response = http.request( - "GET", - url, - fields=params,) - + "GET", + url, + fields=params, + ) + response_data = response.data print(response._request_url) - + if response.status == 200: return json.loads(response_data) else: Utils.handle_api_error(response) return None - + except urllib3.exceptions.HTTPError as e: print(f"HTTPError: {e}") return None diff --git 
a/src/airflow/airqo_etl_utils/airnow_api.py b/src/airflow/airqo_etl_utils/airnow_api.py index 851afe89cb..a1e65b8189 100644 --- a/src/airflow/airqo_etl_utils/airnow_api.py +++ b/src/airflow/airqo_etl_utils/airnow_api.py @@ -56,20 +56,20 @@ def __request(self, endpoint, params, api_key): total=5, backoff_factor=5, ) - + http = urllib3.PoolManager(retries=retry_strategy) - + try: response = http.request("GET", url, fields=params) response_data = response.data print(response._request_url) - + if response.status == 200: return json.loads(response_data) else: Utils.handle_api_error(response) return None - + except urllib3.exceptions.HTTPError as e: print(f"HTTPError: {e}") return None diff --git a/src/airflow/airqo_etl_utils/airqo_api.py b/src/airflow/airqo_etl_utils/airqo_api.py index 876f630a9d..e80cd7bf39 100644 --- a/src/airflow/airqo_etl_utils/airqo_api.py +++ b/src/airflow/airqo_etl_utils/airqo_api.py @@ -322,7 +322,7 @@ def __request(self, endpoint, params=None, body=None, method=None, base_url=None params.update({"token": self.AIRQO_API_TOKEN}) retry_strategy = Retry( - total=5, + total=5, backoff_factor=5, ) @@ -338,21 +338,21 @@ def __request(self, endpoint, params=None, body=None, method=None, base_url=None encoded_args = urlencode(params) url = url + "?" + encoded_args response = http.request( - "PUT", + "PUT", url, - headers=headers, - body=simplejson.dumps(body, ignore_nan=True) - ) + headers=headers, + body=simplejson.dumps(body, ignore_nan=True), + ) elif method == "post": headers["Content-Type"] = "application/json" encoded_args = urlencode(params) url = url + "?" + encoded_args response = http.request( - "POST", + "POST", url, - headers=headers, - body=simplejson.dumps(body, ignore_nan=True) - ) + headers=headers, + body=simplejson.dumps(body, ignore_nan=True), + ) else: handle_api_error("Invalid") return None @@ -368,4 +368,3 @@ def __request(self, endpoint, params=None, body=None, method=None, base_url=None except urllib3.exceptions.HTTPError as e: print(f"HTTPError: {e}") return None - diff --git a/src/airflow/airqo_etl_utils/bigquery_api.py b/src/airflow/airqo_etl_utils/bigquery_api.py index 879f25db5b..82212655e2 100644 --- a/src/airflow/airqo_etl_utils/bigquery_api.py +++ b/src/airflow/airqo_etl_utils/bigquery_api.py @@ -630,8 +630,10 @@ def fetch_data(self, start_date_time: str, historical: bool = False): df = self.client.query(f"{query}", job_config).result().to_dataframe() return df - def \ - fetch_training_data(self, start_date_time:str,) -> pd.DataFrame: + def fetch_training_data( + self, + start_date_time: str, + ) -> pd.DataFrame: query = f""" SELECT DISTINCT t1.device_id, @@ -649,7 +651,7 @@ def \ job_config = bigquery.QueryJobConfig() job_config.use_query_cache = True - + df = self.client.query(f"{query}", job_config).result().to_dataframe() return df diff --git a/src/airflow/airqo_etl_utils/config.py b/src/airflow/airqo_etl_utils/config.py index 065efecdb5..9176da5f2e 100644 --- a/src/airflow/airqo_etl_utils/config.py +++ b/src/airflow/airqo_etl_utils/config.py @@ -170,6 +170,7 @@ class Config: FORECAST_MODELS_BUCKET = os.getenv("FORECAST_MODELS_BUCKET") MONGO_URI = os.getenv("MONGO_URI") MONGO_DATABASE_NAME = os.getenv("MONGO_DATABASE_NAME") + ENVIRONMENT = os.getenv("ENVIRONMENT") configuration = Config() diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 380e8e58a3..ef9af89e76 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -1,13 +1,14 @@ +import json from 
datetime import datetime import gcsfs import joblib import mlflow import numpy as np +import optuna import pandas as pd import pymongo as pm -from category_encoders import OneHotEncoder, CountEncoder -from lightgbm import LGBMRegressor, early_stopping +from lightgbm import LGBMRegressor from scipy.stats import skew from sklearn.metrics import mean_squared_error @@ -16,6 +17,7 @@ fixed_columns = ["site_id"] project_id = configuration.GOOGLE_CLOUD_PROJECT_ID bucket = configuration.FORECAST_MODELS_BUCKET +environment = configuration.ENVIRONMENT def get_trained_model_from_gcs(project_name, bucket_name, source_blob_name): @@ -30,8 +32,6 @@ def upload_trained_model_to_gcs( trained_model, project_name, bucket_name, source_blob_name ): fs = gcsfs.GCSFileSystem(project=project_name) - - # backup previous model try: fs.rename( f"{bucket_name}/{source_blob_name}", @@ -46,8 +46,22 @@ def upload_trained_model_to_gcs( job = joblib.dump(trained_model, handle) +def upload_mapping_to_gcs(mapping_dict, project_name, bucket_name, source_blob_name): + fs = gcsfs.GCSFileSystem(project=project_name) + mapping_dict = json.dumps(mapping_dict) + with fs.open(bucket_name + "/" + source_blob_name, "w") as f: + f.write(mapping_dict) + + +def get_mapping_from_gcs(project_name, bucket_name, source_blob_name): + fs = gcsfs.GCSFileSystem(project=project_name) + with fs.open(bucket_name + "/" + source_blob_name, "r") as f: + mapping_dict = json.load(f) + return mapping_dict + + class ForecastUtils: - ###FORECAST MODEL TRAINING UTILS#### + # FORECAST MODEL TRAINING UTILS @staticmethod def preprocess_training_data(data, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) @@ -68,7 +82,7 @@ def preprocess_training_data(data, frequency): return data @staticmethod - def feature_eng_training_data(data, target_column, frequency): + def feature_eng_training_data(data, target_column, frequency): def get_lag_features(df, target_col, freq): df = df.sort_values(by=["device_id", "timestamp"]) @@ -84,28 +98,24 @@ def get_lag_features(df, target_col, freq): for s in shifts: for f in functions: df[f"pm2_5_{f}_{s}_day"] = ( - df.groupby(["device_number"])[target_col] + df.groupby(["device_id"])[target_col] .shift(1) .rolling(s) .agg(f) ) elif freq == "hourly": - shifts = [ - 1, - 2, - ] # TODO: Review to increase these both in training and the actual job + shifts = [1, 2, 6, 12] for s in shifts: - df[f"pm2_5_last_{s}_hour"] = df.groupby(["device_number"])[ + df[f"pm2_5_last_{s}_hour"] = df.groupby(["device_id"])[ target_col ].shift(s) - # lag features - shifts = [6, 12, 24, 48] + shifts = [3, 6, 12, 24] functions = ["mean", "std", "median", "skew"] for s in shifts: for f in functions: df[f"pm2_5_{f}_{s}_hour"] = ( - df.groupby(["device_number"])[target_col] + df.groupby(["device_id"])[target_col] .shift(1) .rolling(s) .agg(f) @@ -115,270 +125,232 @@ def get_lag_features(df, target_col, freq): return df - def get_other_features(df_tmp, freq): - # TODO: Experiment on impact of features - attributes = ["year", "month", "day", "dayofweek"] - if freq == "hourly": - attributes.extend(["hour", "minute"]) - for a in attributes: - df_tmp[a] = df_tmp["created_at"].dt.__getattribute__(a) - df_tmp["week"] = df_tmp["created_at"].dt.isocalendar().week.astype(int) - - print("Additional features added") - return df_tmp - def encode_categorical_features(df_tmp): - #use count_encoding on site_id & device_id,also save the real values & what they've been encoded to in a dictionary - - #encode site_id - site_id_encoder = CountEncoder() - 
site_id_encoder.fit(df_tmp['site_id']) - df_tmp['site_id'] = site_id_encoder.transform(df_tmp['site_id']) - # site_id_encoder_dict = site_id_encoder.mapping[0]['mapping'] - - #encode device_id - device_id_encoder = CountEncoder() - device_id_encoder.fit(df_tmp['device_id']) - df_tmp['device_id'] = device_id_encoder.transform(df_tmp['device_id']) - # device_id_encoder_dict = device_id_encoder.mapping[0]['mapping'] - - device_category_encoder = OneHotEncoder(cols=['device_category']) - df_tmp = device_category_encoder.fit_transform(df_tmp) + def count_encode_categorical_features(df): + device_id_mappings, site_id_mappings, device_category_mappings = {}, {}, {} + for col in ["device_id", "site_id", "device_category"]: + counts = df[col].value_counts() + count_dict = dict(zip(counts.index, counts.values)) + if col == "device_id": + device_id_mappings = count_dict + elif col == "site_id": + site_id_mappings = count_dict + elif col == "device_category": + device_category_mappings = count_dict + df[f"{col}"] = df[col].map(count_dict) + mappings = [device_id_mappings, site_id_mappings, device_category_mappings] + for mapping in mappings: + upload_mapping_to_gcs(mapping, project_id, bucket, f"{mapping}.json") - return df_tmp + return df - def get_time_and_cyclic_features(df): + def get_time_and_cyclic_features(df, freq): attributes = ["year", "month", "day", "dayofweek", "hour"] max_vals = [2023, 12, 31, 6, 23] + if freq == "hourly": + attributes.append("minute") + max_vals.append(59) for a, m in zip(attributes, max_vals): df[a] = df["timestamp"].dt.__getattribute__(a) df[a + "_sin"] = np.sin(2 * np.pi * df[a] / m) df[a + "_cos"] = np.cos(2 * np.pi * df[a] / m) - + df["week"] = df["timestamp"].dt.isocalendar().week df["week_sin"] = np.sin(2 * np.pi * df["week"] / 52) df["week_cos"] = np.cos(2 * np.pi * df["week"] / 52) df.drop(columns=attributes, inplace=True) return df - data["timestamp"] = pd.to_datetime(data["timestamp"]) - df_tmp = get_other_features(data, frequency) - df_tmp = get_lag_features(df_tmp, target_column, frequency) - df_tmp = encode_categorical_features(df_tmp) - df_tmp = get_time_and_cyclic_features(df_tmp) + df_tmp = get_lag_features(data, target_column, frequency) + df_tmp = count_encode_categorical_features(df_tmp) + df_tmp = get_time_and_cyclic_features(df_tmp, frequency) return df_tmp @staticmethod - def train_and_save_hourly_forecast_model(train): # separate code for hourly model + def train_and_save_forecast_models(train, frequency): """ Perform the actual training for hourly data """ - train["created_at"] = pd.to_datetime(train["created_at"]) - train = train.sort_values(by=["device_number", "created_at"]) - features = [c for c in train.columns if c not in ["created_at", "pm2_5"]] + train["timestamp"] = pd.to_datetime(train["timestamp"]) + train = train.sort_values(by=["device_id", "timestamp"]) + features = [c for c in train.columns if c not in ["timestamp", "pm2_5"]] print(features) target_col = "pm2_5" - train_data, test_data = pd.DataFrame(), pd.DataFrame() - for device_number in train["device_number"].unique(): - device_df = train[train["device_number"] == device_number] - device_df = device_df.sort_values(by="created_at") - months = device_df["created_at"].dt.month.unique() - train_months = months[:4] - test_months = months[4:] - train_df = device_df[device_df["created_at"].dt.month.isin(train_months)] - test_df = device_df[device_df["created_at"].dt.month.isin(test_months)] + train_data, validation_data, test_data = ( + pd.DataFrame(), + pd.DataFrame(), + 
pd.DataFrame(), + ) + for device in train["device_id"].unique(): + device_df = train[train["device_id"] == device] + device_df = device_df.sort_values(by="timestamp") + months = device_df["timestamp"].dt.month.unique() + train_months = val_months = test_months = 0 + if frequency == "hourly": + train_months = months[:8] + val_months = months[8:9] + test_months = months[9:] + elif frequency == "daily": + train_months = months[:10] + val_months = months[10:11] + test_months = months[11:] + + train_df = device_df[device_df["timestamp"].dt.month.isin(train_months)] + val_df = device_df[device_df["timestamp"].dt.month.isin(val_months)] + test_df = device_df[device_df["timestamp"].dt.month.isin(test_months)] train_data = pd.concat([train_data, train_df]) + validation_data = pd.concat([validation_data, val_df]) test_data = pd.concat([test_data, test_df]) - train_data.drop(columns=["created_at"], axis=1, inplace=True) - test_data.drop(columns=["created_at"], axis=1, inplace=True) + train_data.drop(columns=["timestamp"], axis=1, inplace=True) + validation_data.drop(columns=["timestamp"], axis=1, inplace=True) + test_data.drop(columns=["timestamp"], axis=1, inplace=True) - train_target, test_target = train_data[target_col], test_data[target_col] + train_target, validation_target, test_target = ( + train_data[target_col], + validation_data[target_col], + test_data[target_col], + ) - with mlflow.start_run(): - print("Model training started.....") - n_estimators = 5000 - learning_rate = 0.05 - colsample_bytree = 0.4 - reg_alpha = 0 - reg_lambda = 1 - max_depth = 1 - random_state = 1 + mlflow.set_tracking_uri(configuration.MLFLOW_TRACKING_URI) + mlflow.set_experiment(f"LGBM_{frequency}_forecast_model_{environment}") + registered_model_name = f"LGBM_{frequency}_forecast_model_{environment}" - clf = LGBMRegressor( - n_estimators=n_estimators, - learning_rate=learning_rate, - colsample_bytree=colsample_bytree, - reg_alpha=reg_alpha, - reg_lambda=reg_lambda, - max_depth=max_depth, - random_state=random_state, - ) + mlflow.lightgbm.autolog(registered_model_name=registered_model_name) - clf.fit( - train_data[features], - train_target, - eval_set=[(test_data[features], test_target)], - callbacks=[early_stopping(stopping_rounds=150)], - eval_metric="rmse", - ) - print("Model training completed.....") - - # Log parameters - mlflow.log_param("n_estimators", n_estimators) - mlflow.log_param("learning_rate", learning_rate) - mlflow.log_param("colsample_bytree", colsample_bytree) - mlflow.log_param("reg_alpha", reg_alpha) - mlflow.log_param("reg_lamba", reg_lambda) - mlflow.log_param("max_depth", max_depth) - mlflow.log_param("random_state", random_state) - - # Log moder - mlflow.sklearn.log_model( - sk_model=clf, - artifact_path="hourly_forecast_model", - registered_model_name=f"LGBM_hourly_forecast_model_development", - ) - - print("Being model validation.....") - - val_preds = clf.predict(test_data[features]) - rmse_val = mean_squared_error(test_data[target_col], val_preds) ** 0.5 + sampler = optuna.samplers.TPESampler() + pruner = optuna.pruners.SuccessiveHalvingPruner( + min_resource=10, reduction_factor=2, min_early_stopping_rate=0 + ) + study = optuna.create_study( + direction="minimize", study_name="LGBM", sampler=sampler, pruner=pruner + ) - print("Model validation completed.....") - print(f"Validation RMSE is {rmse_val}") + def objective(trial): + param_grid = { + "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.1, 1), + "reg_alpha": trial.suggest_uniform("reg_alpha", 0, 10), + "reg_lambda": 
trial.suggest_uniform("reg_lambda", 0, 10), + "n_estimators": trial.suggest_categorical("n_estimators", [10000]), + "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3), + "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20), + "max_depth": trial.suggest_int("max_depth", 3, 12), + "min_split_gain": trial.suggest_float("min_gain_to_split", 0, 15), + } - # Log metrics - mlflow.log_metric("VAL_RMSE", rmse_val) + score = 0 + for step in range(4): + lgb_reg = LGBMRegressor( + objective="regression", + n_jobs=2, + random_state=42, + **param_grid, + verbosity=2, + ) + lgb_reg.fit( + train_data[features], + train_target, + categorical_feature=["device_id", "site_id", "device_category"], + eval_set=[(test_data[features], test_target)], + eval_metric="rmse", + ) - best_iter = clf.best_iteration_ - clf = LGBMRegressor( - n_estimators=best_iter, - learning_rate=0.05, - colsample_bytree=0.4, - reg_alpha=2, - reg_lambda=1, - max_depth=-1, - random_state=1, - verbosity=2, - ) - train["device_number"] = train["device_number"].astype(int) - clf.fit(train[features], train[target_col]) - upload_trained_model_to_gcs(clf, project_id, bucket, "hourly_forecast_model") + val_preds = lgb_reg.predict(validation_data[features]) + score = mean_squared_error(validation_target, val_preds) + if trial.should_prune(): + raise optuna.TrialPruned() - @staticmethod - def train_and_save_daily_forecast_model(train): # separate code for monthly model - train["timestamp"] = pd.to_datetime(train["timestamp"]) - train = train.sort_values(by=['device_id', 'timestamp']) - features = [c for c in train.columns if c not in ["timestamp", "pm2_5"]] - print(features) - target_col = "pm2_5" - train_data, test_data = pd.DataFrame(), pd.DataFrame() - - for device_number in train["device_number"].unique(): - device_df = train[train["device_number"] == device_number] - device_df = device_df.sort_values(by="created_at") - months = device_df["created_at"].dt.month.unique() - train_months = months[:8] - test_months = months[8:] - train_df = device_df[device_df["created_at"].dt.month.isin(train_months)] - test_df = device_df[device_df["created_at"].dt.month.isin(test_months)] - train_data = pd.concat([train_data, train_df]) - test_data = pd.concat([test_data, test_df]) + return score - train_data.drop(columns=["created_at"], axis=1, inplace=True) - test_data.drop(columns=["created_at"], axis=1, inplace=True) + study.optimize(objective, n_trials=150) - train_target, test_target = train_data[target_col], test_data[target_col] with mlflow.start_run(): - print("Model training started.....") - n_estimators = 5000 - learning_rate = 0.05 - colsample_bytree = 0.4 - reg_alpha = 0 - reg_lambda = 1 - max_depth = 1 - random_state = 1 - + best_params = study.best_params + print(f"Best params are {best_params}") clf = LGBMRegressor( - n_estimators=n_estimators, - learning_rate=learning_rate, - colsample_bytree=colsample_bytree, - reg_alpha=reg_alpha, - reg_lambda=reg_lambda, - max_depth=max_depth, - random_state=random_state, + n_estimators=best_params["n_estimators"], + learning_rate=best_params["learning_rate"], + colsample_bytree=best_params["colsample_bytree"], + reg_alpha=best_params["reg_alpha"], + reg_lambda=best_params["reg_lambda"], + max_depth=best_params["max_depth"], + random_state=42, + verbosity=2, ) clf.fit( train_data[features], train_target, eval_set=[(test_data[features], test_target)], - callbacks=[early_stopping(stopping_rounds=150)], eval_metric="rmse", - ) - print("Model training completed.....") - - # Log 
parameters - mlflow.log_param("n_estimators", n_estimators) - mlflow.log_param("learning_rate", learning_rate) - mlflow.log_param("colsample_bytree", colsample_bytree) - mlflow.log_param("reg_alpha", reg_alpha) - mlflow.log_param("reg_lamba", reg_lambda) - mlflow.log_param("max_depth", max_depth) - mlflow.log_param("random_state", random_state) - - # Log model - mlflow.sklearn.log_model( - sk_model=clf, - artifact_path="daily_forecast_model", - registered_model_name=f"LGBM_daily_forecast_model_development", + categorical_feature=["device_id", "site_id", "device_category"], ) - # model validation - print("Being model validation.....") + # train quantile regression models for 0.025 and 0.975 quantiles + clf_025 = LGBMRegressor( + n_estimators=best_params["n_estimators"], + learning_rate=best_params["learning_rate"], + colsample_bytree=best_params["colsample_bytree"], + reg_alpha=best_params["reg_alpha"], + reg_lambda=best_params["reg_lambda"], + max_depth=best_params["max_depth"], + random_state=42, + verbosity=2, + objective="quantile", + alpha=0.025, + metric="quantile", + ) - val_preds = clf.predict(test_data[features]) - rmse_val = mean_squared_error(test_data[target_col], val_preds) ** 0.5 + clf_025.fit( + train_data[features], + train_target, + eval_set=[(test_data[features], test_target)], + categorical_feature=["device_id", "site_id", "device_category"], + ) - print("Model validation completed.....") - print(f"Validation RMSE is {rmse_val}") + clf_975 = LGBMRegressor( + n_estimators=best_params["n_estimators"], + learning_rate=best_params["learning_rate"], + colsample_bytree=best_params["colsample_bytree"], + reg_alpha=best_params["reg_alpha"], + reg_lambda=best_params["reg_lambda"], + max_depth=best_params["max_depth"], + random_state=42, + verbosity=2, + objective="quantile", + alpha=0.975, + metric="quantile", + ) - # Log metrics - mlflow.log_metric("VAL_RMSE", rmse_val) + clf_975.fit( + train_data[features], + train_target, + eval_set=[(test_data[features], test_target)], + categorical_feature=["device_id", "site_id", "device_category"], + ) - best_iter = clf.best_iteration_ - clf = LGBMRegressor( - n_estimators=best_iter, - learning_rate=0.05, - colsample_bytree=0.4, - reg_alpha=2, - reg_lambda=1, - max_depth=-1, - random_state=1, + upload_trained_model_to_gcs( + clf, project_id, bucket, "hourly_forecast_model.pkl" ) - clf.fit(train[features], train[target_col]) - upload_trained_model_to_gcs(clf, project_id, bucket, "daily_forecast_model.pkl") - print("Model saved successfully") #### FORECAST JOB UTILS #### @staticmethod def preprocess_historical_data(data, frequency): - data["created_at"] = pd.to_datetime(data["created_at"]) + data["timestamp"] = pd.to_datetime(data["timestamp"]) data["device_number"] = data["device_number"].astype(str) data["pm2_5"] = data.groupby(fixed_columns + ["device_number"])[ "pm2_5" ].transform(lambda x: x.interpolate(method="linear", limit_direction="both")) if frequency == "hourly": data.sort_values( - by=fixed_columns + ["device_number", "created_at"], inplace=True + by=fixed_columns + ["device_number", "timestamp"], inplace=True ) elif frequency == "daily": data = ( data.groupby(fixed_columns + ["device_number"]) - .resample("D", on="created_at") + .resample("D", on="timestamp") .mean(numeric_only=True) ) data.reset_index(inplace=True) @@ -388,7 +360,7 @@ def preprocess_historical_data(data, frequency): lambda x: x.interpolate(method="linear", limit_direction="both") ) data.sort_values( - by=fixed_columns + ["device_number", "created_at"], 
inplace=True + by=fixed_columns + ["device_number", "timestamp"], inplace=True ) else: raise ValueError("Invalid frequency argument") @@ -398,8 +370,8 @@ def preprocess_historical_data(data, frequency): @staticmethod def get_lag_features(df_tmp, TARGET_COL, frequency): - df_tmp["created_at"] = pd.to_datetime(df_tmp["created_at"]) - df_tmp = df_tmp.sort_values(by=fixed_columns + ["device_number", "created_at"]) + df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) + df_tmp = df_tmp.sort_values(by=fixed_columns + ["device_number", "timestamp"]) if frequency == "hourly": shifts = [1, 2] for s in shifts: @@ -440,27 +412,27 @@ def get_lag_features(df_tmp, TARGET_COL, frequency): @staticmethod def get_time_features(df_tmp, frequency): - df_tmp["created_at"] = pd.to_datetime(df_tmp["created_at"]) + df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) attributes = ["year", "month", "day", "dayofweek"] if frequency == "hourly": attributes.extend(["hour", "minute"]) for a in attributes: - df_tmp[a] = df_tmp["created_at"].dt.__getattribute__(a) + df_tmp[a] = df_tmp["timestamp"].dt.__getattribute__(a) - df_tmp["week"] = df_tmp["created_at"].dt.isocalendar().week + df_tmp["week"] = df_tmp["timestamp"].dt.isocalendar().week print("Adding other features") return df_tmp @staticmethod def generate_hourly_forecasts(data, project_name, bucket_name, source_blob_name): - data["created_at"] = pd.to_datetime(data["created_at"]) + data["timestamp"] = pd.to_datetime(data["timestamp"]) def get_new_row(df, device1, model): last_row = df[df["device_number"] == device1].iloc[-1] new_row = pd.Series(index=last_row.index, dtype="float64") for i in fixed_columns: new_row[i] = last_row[i] - new_row["created_at"] = last_row["created_at"] + pd.Timedelta(hours=1) + new_row["timestamp"] = last_row["timestamp"] + pd.Timedelta(hours=1) new_row["device_number"] = device1 new_row[f"pm2_5_last_1_hour"] = last_row["pm2_5"] new_row[f"pm2_5_last_2_hour"] = last_row[f"pm2_5_last_{1}_hour"] @@ -498,11 +470,11 @@ def get_new_row(df, device1, model): attributes = ["year", "month", "day", "dayofweek", "hour", "minute"] for a in attributes: - new_row[a] = new_row["created_at"].__getattribute__(a) - new_row["week"] = new_row["created_at"].isocalendar().week + new_row[a] = new_row["timestamp"].__getattribute__(a) + new_row["week"] = new_row["timestamp"].isocalendar().week new_row["pm2_5"] = model.predict( - new_row.drop(fixed_columns + ["created_at", "pm2_5"]).values.reshape( + new_row.drop(fixed_columns + ["timestamp", "pm2_5"]).values.reshape( 1, -1 ) )[0] @@ -524,7 +496,7 @@ def get_new_row(df, device1, model): forecasts["device_number"] = forecasts["device_number"].astype(int) forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) - forecasts.rename(columns={"created_at": "time"}, inplace=True) + forecasts.rename(columns={"timestamp": "time"}, inplace=True) forecasts["time"] = pd.to_datetime(forecasts["time"], utc=True) current_time = datetime.utcnow() current_time_utc = pd.Timestamp(current_time, tz="UTC") @@ -536,14 +508,14 @@ def get_new_row(df, device1, model): @staticmethod def generate_daily_forecasts(data, project_name, bucket_name, source_blob_name): - data["created_at"] = pd.to_datetime(data["created_at"]) + data["timestamp"] = pd.to_datetime(data["timestamp"]) def get_new_row(df_tmp, device, model): last_row = df_tmp[df_tmp["device_number"] == device].iloc[-1] new_row = pd.Series(index=last_row.index, dtype="float64") for i in fixed_columns: new_row[i] = last_row[i] - new_row["created_at"] = last_row["created_at"] + 
pd.Timedelta(days=1) + new_row["timestamp"] = last_row["timestamp"] + pd.Timedelta(days=1) new_row["device_number"] = device new_row[f"pm2_5_last_1_day"] = last_row["pm2_5"] new_row[f"pm2_5_last_2_day"] = last_row[f"pm2_5_last_{1}_day"] @@ -577,11 +549,11 @@ def get_new_row(df_tmp, device, model): # Use the date of the new row to create other features attributes = ["year", "month", "day", "dayofweek"] for a in attributes: - new_row[a] = new_row["created_at"].__getattribute__(a) - new_row["week"] = new_row["created_at"].isocalendar().week + new_row[a] = new_row["timestamp"].__getattribute__(a) + new_row["week"] = new_row["timestamp"].isocalendar().week new_row["pm2_5"] = model.predict( - new_row.drop(fixed_columns + ["created_at", "pm2_5"]).values.reshape( + new_row.drop(fixed_columns + ["timestamp", "pm2_5"]).values.reshape( 1, -1 ) )[0] @@ -608,7 +580,7 @@ def get_new_row(df_tmp, device, model): forecasts = pd.concat([forecasts, test_copy], ignore_index=True) forecasts["device_number"] = forecasts["device_number"].astype(int) forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) - forecasts.rename(columns={"created_at": "time"}, inplace=True) + forecasts.rename(columns={"timestamp": "time"}, inplace=True) current_time = datetime.utcnow() current_time_utc = pd.Timestamp(current_time, tz="UTC") result = forecasts[fixed_columns + ["time", "pm2_5", "device_number"]][ @@ -619,7 +591,7 @@ def get_new_row(df_tmp, device, model): @staticmethod def save_forecasts_to_mongo(data, frequency): - created_at = pd.to_datetime(datetime.now()).isoformat() + timestamp = pd.to_datetime(datetime.now()).isoformat() device_numbers = data["device_number"].unique() forecast_results = [ { @@ -628,7 +600,7 @@ def save_forecasts_to_mongo(data, frequency): else data[data["device_number"] == i][field].tolist() for field in data.columns } - | {"created_at": created_at} + | {"timestamp": timestamp} for i in device_numbers ] client = pm.MongoClient(configuration.MONGO_URI) diff --git a/src/airflow/airqo_etl_utils/plume_labs_api.py b/src/airflow/airqo_etl_utils/plume_labs_api.py index b3ba62e0ac..def07e7114 100644 --- a/src/airflow/airqo_etl_utils/plume_labs_api.py +++ b/src/airflow/airqo_etl_utils/plume_labs_api.py @@ -182,24 +182,25 @@ def __request(self, endpoint, params): total=5, backoff_factor=5, ) - + http = urllib3.PoolManager(retries=retry_strategy) - + try: response = http.request( - "GET", - url, - fields=params,) - + "GET", + url, + fields=params, + ) + response_data = response.data print(response._request_url) - + if response.status == 200: return json.loads(response_data) else: Utils.handle_api_error(response) return None - + except urllib3.exceptions.HTTPError as e: print(f"HTTPError: {e}") return None diff --git a/src/airflow/airqo_etl_utils/purple_air_api.py b/src/airflow/airqo_etl_utils/purple_air_api.py index b6dd0ec4d4..025f26283d 100644 --- a/src/airflow/airqo_etl_utils/purple_air_api.py +++ b/src/airflow/airqo_etl_utils/purple_air_api.py @@ -32,31 +32,31 @@ def get_data( return response if response else {} def __request(self, endpoint, params): - url = f"{self.PURPLE_AIR_BASE_URL}{endpoint}" retry_strategy = Retry( total=5, backoff_factor=5, ) - + http = urllib3.PoolManager(retries=retry_strategy) - + try: response = http.request( - "GET", - url, + "GET", + url, fields=params, - headers={"x-api-key": self.PURPLE_AIR_API_KEY},) - + headers={"x-api-key": self.PURPLE_AIR_API_KEY}, + ) + response_data = response.data print(response._request_url) - + if response.status == 200: return 
json.loads(response_data) else: Utils.handle_api_error(response) return None - + except urllib3.exceptions.HTTPError as e: print(f"HTTPError: {e}") return None diff --git a/src/airflow/airqo_etl_utils/tahmo_api.py b/src/airflow/airqo_etl_utils/tahmo_api.py index 9f9db8a7f8..07a3e771eb 100644 --- a/src/airflow/airqo_etl_utils/tahmo_api.py +++ b/src/airflow/airqo_etl_utils/tahmo_api.py @@ -54,32 +54,29 @@ def get_measurements(self, start_time, end_time, station_codes=None): return measurements.to_dict(orient="records") def __request(self, endpoint, params): - url = f"{self.BASE_URL}{endpoint}" retry_strategy = Retry( total=5, backoff_factor=5, ) - + http = urllib3.PoolManager(retries=retry_strategy) - + try: - headers = urllib3.util.make_headers(basic_auth=f"{self.API_KEY}:{self.API_SECRET}") - response = http.request( - "GET", - url, - fields=params, - headers=headers) - + headers = urllib3.util.make_headers( + basic_auth=f"{self.API_KEY}:{self.API_SECRET}" + ) + response = http.request("GET", url, fields=params, headers=headers) + response_data = response.data print("Tahmo API request: %s" % response._request_url) - + if response.status == 200: return json.loads(response_data) else: Utils.handle_api_error(response) return None - + except urllib3.exceptions.HTTPError as e: print(f"HTTPError: {e}") return None From 9ec80cc95deecb53c43bb003da0f12f431ec7c18 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Mon, 21 Aug 2023 09:37:20 +0300 Subject: [PATCH 03/43] Fix category encoder --- src/airflow/airqo_etl_utils/bigquery_api.py | 2 +- src/airflow/airqo_etl_utils/ml_utils.py | 64 ++++++++++----------- 2 files changed, 30 insertions(+), 36 deletions(-) diff --git a/src/airflow/airqo_etl_utils/bigquery_api.py b/src/airflow/airqo_etl_utils/bigquery_api.py index 82212655e2..360fb74df0 100644 --- a/src/airflow/airqo_etl_utils/bigquery_api.py +++ b/src/airflow/airqo_etl_utils/bigquery_api.py @@ -639,7 +639,7 @@ def fetch_training_data( t1.device_id, t1.timestamp, t1.site_id, - t1.pm2_5_calibrated_value, + t1.pm2_5_calibrated_value as pm2_5, t2.latitude, t2.longitude, t3.device_category diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index ef9af89e76..3d551aa72a 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -1,4 +1,5 @@ import json +import random from datetime import datetime import gcsfs @@ -65,17 +66,17 @@ class ForecastUtils: @staticmethod def preprocess_training_data(data, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) - data["pm2_5"] = data.groupby("device_id")["pm2_5"].transform( + data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])["pm2_5"].transform( lambda x: x.interpolate(method="linear", limit_direction="both") ) if frequency == "daily": data = ( - data.groupby(["device_id"]) + data.groupby(["device_id", "site_id", "device_category"]) .resample("D", on="timestamp") .mean(numeric_only=True) ) data.reset_index(inplace=True) - data["pm2_5"] = data.groupby("device_id")["pm2_5"].transform( + data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])["pm2_5"].transform( lambda x: x.interpolate(method="linear", limit_direction="both") ) data = data.dropna(subset=["pm2_5"]) @@ -84,8 +85,6 @@ def preprocess_training_data(data, frequency): @staticmethod def feature_eng_training_data(data, target_column, frequency): def get_lag_features(df, target_col, freq): - df = df.sort_values(by=["device_id", 
"timestamp"]) - if freq == "daily": shifts = [1, 2, 3, 7, 14] for s in shifts: @@ -125,30 +124,30 @@ def get_lag_features(df, target_col, freq): return df - def count_encode_categorical_features(df): - device_id_mappings, site_id_mappings, device_category_mappings = {}, {}, {} - for col in ["device_id", "site_id", "device_category"]: - counts = df[col].value_counts() - count_dict = dict(zip(counts.index, counts.values)) - if col == "device_id": - device_id_mappings = count_dict - elif col == "site_id": - site_id_mappings = count_dict - elif col == "device_category": - device_category_mappings = count_dict - df[f"{col}"] = df[col].map(count_dict) - mappings = [device_id_mappings, site_id_mappings, device_category_mappings] - for mapping in mappings: - upload_mapping_to_gcs(mapping, project_id, bucket, f"{mapping}.json") - + def encode_categorical_features(df): + columns = ["device_id", "site_id", "device_category"] + mappings = [] + for col in columns: + mapping = {} + for val in df[col].unique(): + num = random.randint(0, 10000) + while num in mapping.values(): + num = random.randint(0, 10000) + mapping[val] = num + df[col] = df[col].map(mapping) + mappings.append(mapping) + for i, col in enumerate(columns): + upload_mapping_to_gcs( + mappings[i], project_id, bucket, f"{col}_mapping.json" + ) return df def get_time_and_cyclic_features(df, freq): - attributes = ["year", "month", "day", "dayofweek", "hour"] + attributes = ["year", "month", "day", "dayofweek"] max_vals = [2023, 12, 31, 6, 23] if freq == "hourly": - attributes.append("minute") - max_vals.append(59) + attributes.extend(["hour", "minute"]) + max_vals.append([23, 59]) for a, m in zip(attributes, max_vals): df[a] = df["timestamp"].dt.__getattribute__(a) df[a + "_sin"] = np.sin(2 * np.pi * df[a] / m) @@ -162,7 +161,7 @@ def get_time_and_cyclic_features(df, freq): data["timestamp"] = pd.to_datetime(data["timestamp"]) df_tmp = get_lag_features(data, target_column, frequency) - df_tmp = count_encode_categorical_features(df_tmp) + df_tmp = encode_categorical_features(df_tmp) df_tmp = get_time_and_cyclic_features(df_tmp, frequency) return df_tmp @@ -173,24 +172,19 @@ def train_and_save_forecast_models(train, frequency): Perform the actual training for hourly data """ train["timestamp"] = pd.to_datetime(train["timestamp"]) - train = train.sort_values(by=["device_id", "timestamp"]) features = [c for c in train.columns if c not in ["timestamp", "pm2_5"]] print(features) target_col = "pm2_5" - train_data, validation_data, test_data = ( - pd.DataFrame(), - pd.DataFrame(), - pd.DataFrame(), - ) + train_data = validation_data = test_data = pd.DataFrame() for device in train["device_id"].unique(): device_df = train[train["device_id"] == device] device_df = device_df.sort_values(by="timestamp") months = device_df["timestamp"].dt.month.unique() - train_months = val_months = test_months = 0 + train_months = val_months = test_months = [] if frequency == "hourly": - train_months = months[:8] - val_months = months[8:9] - test_months = months[9:] + train_months = months[:10] + val_months = months[10:11] + test_months = months[11:] elif frequency == "daily": train_months = months[:10] val_months = months[10:11] From 1f766234753fcdbf9784ee4a0a7ae6787f0ff5d0 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Mon, 21 Aug 2023 23:03:16 +0300 Subject: [PATCH 04/43] code cleanup - training job --- src/airflow/airqo_etl_utils/ml_utils.py | 123 ++++++++++++------------ 1 file changed, 61 insertions(+), 62 
deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 3d551aa72a..b1131e53ad 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -9,7 +9,7 @@ import optuna import pandas as pd import pymongo as pm -from lightgbm import LGBMRegressor +from lightgbm import LGBMRegressor, early_stopping from scipy.stats import skew from sklearn.metrics import mean_squared_error @@ -178,17 +178,16 @@ def train_and_save_forecast_models(train, frequency): train_data = validation_data = test_data = pd.DataFrame() for device in train["device_id"].unique(): device_df = train[train["device_id"] == device] - device_df = device_df.sort_values(by="timestamp") months = device_df["timestamp"].dt.month.unique() train_months = val_months = test_months = [] if frequency == "hourly": - train_months = months[:10] - val_months = months[10:11] - test_months = months[11:] + train_months = months[:8] + val_months = months[9] + test_months = months[10] elif frequency == "daily": - train_months = months[:10] - val_months = months[10:11] - test_months = months[11:] + train_months = months[:8] + val_months = months[8:9] + test_months = months[9:] train_df = device_df[device_df["timestamp"].dt.month.isin(train_months)] val_df = device_df[device_df["timestamp"].dt.month.isin(val_months)] @@ -207,11 +206,6 @@ def train_and_save_forecast_models(train, frequency): test_data[target_col], ) - mlflow.set_tracking_uri(configuration.MLFLOW_TRACKING_URI) - mlflow.set_experiment(f"LGBM_{frequency}_forecast_model_{environment}") - registered_model_name = f"LGBM_{frequency}_forecast_model_{environment}" - - mlflow.lightgbm.autolog(registered_model_name=registered_model_name) sampler = optuna.samplers.TPESampler() pruner = optuna.pruners.SuccessiveHalvingPruner( @@ -223,21 +217,18 @@ def train_and_save_forecast_models(train, frequency): def objective(trial): param_grid = { - "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.1, 1), - "reg_alpha": trial.suggest_uniform("reg_alpha", 0, 10), - "reg_lambda": trial.suggest_uniform("reg_lambda", 0, 10), - "n_estimators": trial.suggest_categorical("n_estimators", [10000]), + "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1), + "reg_alpha": trial.suggest_float("reg_alpha", 0, 10), + "reg_lambda": trial.suggest_float("reg_lambda", 0, 10), + "n_estimators": trial.suggest_categorical("n_estimators", [50]), "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3), - "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20), - "max_depth": trial.suggest_int("max_depth", 3, 12), - "min_split_gain": trial.suggest_float("min_gain_to_split", 0, 15), + "num_leaves": trial.suggest_int("num_leaves", 20, 50), + "max_depth": trial.suggest_int("max_depth", 4, 7), } - score = 0 for step in range(4): lgb_reg = LGBMRegressor( objective="regression", - n_jobs=2, random_state=42, **param_grid, verbosity=2, @@ -248,6 +239,7 @@ def objective(trial): categorical_feature=["device_id", "site_id", "device_category"], eval_set=[(test_data[features], test_target)], eval_metric="rmse", + callbacks=[early_stopping(stopping_rounds=150)], ) val_preds = lgb_reg.predict(validation_data[features]) @@ -257,8 +249,14 @@ def objective(trial): return score - study.optimize(objective, n_trials=150) + study.optimize(objective, n_trials=15) + + + mlflow.set_tracking_uri(configuration.MLFLOW_TRACKING_URI) + mlflow.set_experiment(f"{frequency}_forecast_model_{environment}") + 
registered_model_name = f"{frequency}_forecast_model_{environment}" + mlflow.lightgbm.autolog(registered_model_name=registered_model_name, log_datasets=False) with mlflow.start_run(): best_params = study.best_params print(f"Best params are {best_params}") @@ -279,52 +277,53 @@ def objective(trial): eval_set=[(test_data[features], test_target)], eval_metric="rmse", categorical_feature=["device_id", "site_id", "device_category"], + callbacks=[early_stopping(stopping_rounds=150)], ) # train quantile regression models for 0.025 and 0.975 quantiles - clf_025 = LGBMRegressor( - n_estimators=best_params["n_estimators"], - learning_rate=best_params["learning_rate"], - colsample_bytree=best_params["colsample_bytree"], - reg_alpha=best_params["reg_alpha"], - reg_lambda=best_params["reg_lambda"], - max_depth=best_params["max_depth"], - random_state=42, - verbosity=2, - objective="quantile", - alpha=0.025, - metric="quantile", - ) + clf_025 = LGBMRegressor( + n_estimators=best_params["n_estimators"], + learning_rate=best_params["learning_rate"], + colsample_bytree=best_params["colsample_bytree"], + reg_alpha=best_params["reg_alpha"], + reg_lambda=best_params["reg_lambda"], + max_depth=best_params["max_depth"], + random_state=42, + verbosity=2, + objective="quantile", + alpha=0.025, + metric="quantile", + ) - clf_025.fit( - train_data[features], - train_target, - eval_set=[(test_data[features], test_target)], - categorical_feature=["device_id", "site_id", "device_category"], - ) + clf_025.fit( + train_data[features], + train_target, + eval_set=[(test_data[features], test_target)], + categorical_feature=["device_id", "site_id", "device_category"], + ) - clf_975 = LGBMRegressor( - n_estimators=best_params["n_estimators"], - learning_rate=best_params["learning_rate"], - colsample_bytree=best_params["colsample_bytree"], - reg_alpha=best_params["reg_alpha"], - reg_lambda=best_params["reg_lambda"], - max_depth=best_params["max_depth"], - random_state=42, - verbosity=2, - objective="quantile", - alpha=0.975, - metric="quantile", - ) + clf_975 = LGBMRegressor( + n_estimators=best_params["n_estimators"], + learning_rate=best_params["learning_rate"], + colsample_bytree=best_params["colsample_bytree"], + reg_alpha=best_params["reg_alpha"], + reg_lambda=best_params["reg_lambda"], + max_depth=best_params["max_depth"], + random_state=42, + verbosity=2, + objective="quantile", + alpha=0.975, + metric="quantile", + ) - clf_975.fit( - train_data[features], - train_target, - eval_set=[(test_data[features], test_target)], - categorical_feature=["device_id", "site_id", "device_category"], - ) + clf_975.fit( + train_data[features], + train_target, + eval_set=[(test_data[features], test_target)], + categorical_feature=["device_id", "site_id", "device_category"], + ) - upload_trained_model_to_gcs( + upload_trained_model_to_gcs( clf, project_id, bucket, "hourly_forecast_model.pkl" ) From 365746cde7b0e1e05dfe603953c6b42c734f3c84 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Mon, 21 Aug 2023 23:05:57 +0300 Subject: [PATCH 05/43] setup actual forecasting job --- src/airflow/airqo_etl_utils/ml_utils.py | 89 +------------------------ src/airflow/dags/ml_training_jobs.py | 4 +- 2 files changed, 3 insertions(+), 90 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index b1131e53ad..a1e07f9a79 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -62,9 +62,8 @@ def 
get_mapping_from_gcs(project_name, bucket_name, source_blob_name): class ForecastUtils: - # FORECAST MODEL TRAINING UTILS @staticmethod - def preprocess_training_data(data, frequency): + def preprocess__data(data, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])["pm2_5"].transform( lambda x: x.interpolate(method="linear", limit_direction="both") @@ -329,92 +328,6 @@ def objective(trial): #### FORECAST JOB UTILS #### - @staticmethod - def preprocess_historical_data(data, frequency): - data["timestamp"] = pd.to_datetime(data["timestamp"]) - data["device_number"] = data["device_number"].astype(str) - data["pm2_5"] = data.groupby(fixed_columns + ["device_number"])[ - "pm2_5" - ].transform(lambda x: x.interpolate(method="linear", limit_direction="both")) - if frequency == "hourly": - data.sort_values( - by=fixed_columns + ["device_number", "timestamp"], inplace=True - ) - elif frequency == "daily": - data = ( - data.groupby(fixed_columns + ["device_number"]) - .resample("D", on="timestamp") - .mean(numeric_only=True) - ) - data.reset_index(inplace=True) - data["pm2_5"] = data.groupby(fixed_columns + ["device_number"])[ - "pm2_5" - ].transform( - lambda x: x.interpolate(method="linear", limit_direction="both") - ) - data.sort_values( - by=fixed_columns + ["device_number", "timestamp"], inplace=True - ) - else: - raise ValueError("Invalid frequency argument") - data["device_number"] = data["device_number"].astype(int) - data = data.dropna(subset=["pm2_5"]) - return data - - @staticmethod - def get_lag_features(df_tmp, TARGET_COL, frequency): - df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) - df_tmp = df_tmp.sort_values(by=fixed_columns + ["device_number", "timestamp"]) - if frequency == "hourly": - shifts = [1, 2] - for s in shifts: - df_tmp[f"pm2_5_last_{s}_hour"] = df_tmp.groupby(["device_number"])[ - TARGET_COL - ].shift(s) - - shifts = [6, 12, 24, 48] - functions = ["mean", "std", "median", "skew"] - for s in shifts: - for f in functions: - df_tmp[f"pm2_5_{f}_{s}_hour"] = ( - df_tmp.groupby(["device_number"])[TARGET_COL] - .shift(1) - .rolling(s) - .agg(f) - ) - elif frequency == "daily": - shifts = [1, 2] - for s in shifts: - df_tmp[f"pm2_5_last_{s}_day"] = df_tmp.groupby(["device_number"])[ - TARGET_COL - ].shift(s) - shifts = [3, 7, 14, 30] - functions = ["mean", "std", "max", "min"] - for s in shifts: - for f in functions: - df_tmp[f"pm2_5_{f}_{s}_day"] = ( - df_tmp.groupby(["device_number"])[TARGET_COL] - .shift(1) - .rolling(s) - .agg(f) - ) - else: - raise ValueError("Invalid frequency argument") - print("Adding lag features") - return df_tmp - - @staticmethod - def get_time_features(df_tmp, frequency): - df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) - attributes = ["year", "month", "day", "dayofweek"] - if frequency == "hourly": - attributes.extend(["hour", "minute"]) - for a in attributes: - df_tmp[a] = df_tmp["timestamp"].dt.__getattribute__(a) - - df_tmp["week"] = df_tmp["timestamp"].dt.isocalendar().week - print("Adding other features") - return df_tmp @staticmethod def generate_hourly_forecasts(data, project_name, bucket_name, source_blob_name): diff --git a/src/airflow/dags/ml_training_jobs.py b/src/airflow/dags/ml_training_jobs.py index 180f7f7ef2..69eaaa900f 100644 --- a/src/airflow/dags/ml_training_jobs.py +++ b/src/airflow/dags/ml_training_jobs.py @@ -29,7 +29,7 @@ def fetch_training_data_for_hourly_forecast_model(): @task() def 
preprocess_training_data_for_hourly_forecast_model(data): - return ForecastUtils.preprocess_training_data(data, "hourly") + return ForecastUtils.preprocess__data(data, "hourly") @task() def feature_engineer_training_data_for_hourly_forecast_model(data): @@ -53,7 +53,7 @@ def fetch_training_data_for_daily_forecast_model(): @task() def preprocess_training_data_for_daily_forecast_model(data): - return ForecastUtils.preprocess_training_data(data, "daily") + return ForecastUtils.preprocess__data(data, "daily") @task() def feature_engineer_data_for_daily_forecast_model(data): From 6372a12151b26258af6011910a28c39e19bde538 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Tue, 22 Aug 2023 09:41:50 +0300 Subject: [PATCH 06/43] update forecasting job to match training job --- src/airflow/airqo_etl_utils/ml_utils.py | 245 ++++++++---------------- src/airflow/dags/ml_training_jobs.py | 4 +- 2 files changed, 83 insertions(+), 166 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index a1e07f9a79..d900eac7e2 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -10,7 +10,6 @@ import pandas as pd import pymongo as pm from lightgbm import LGBMRegressor, early_stopping -from scipy.stats import skew from sklearn.metrics import mean_squared_error from .config import configuration @@ -63,7 +62,7 @@ def get_mapping_from_gcs(project_name, bucket_name, source_blob_name): class ForecastUtils: @staticmethod - def preprocess__data(data, frequency): + def preprocess_data(data, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])["pm2_5"].transform( lambda x: x.interpolate(method="linear", limit_direction="both") @@ -82,7 +81,7 @@ def preprocess__data(data, frequency): return data @staticmethod - def feature_eng_training_data(data, target_column, frequency): + def feature_eng_data(data, target_column, frequency, job_type): def get_lag_features(df, target_col, freq): if freq == "daily": shifts = [1, 2, 3, 7, 14] @@ -155,13 +154,24 @@ def get_time_and_cyclic_features(df, freq): df["week"] = df["timestamp"].dt.isocalendar().week df["week_sin"] = np.sin(2 * np.pi * df["week"] / 52) df["week_cos"] = np.cos(2 * np.pi * df["week"] / 52) - df.drop(columns=attributes, inplace=True) + df.drop(columns=attributes + ["week"], inplace=True) return df + def decode_categorical_features(df): + columns = ["device_id", "site_id", "device_category"] + for col in columns: + mapping = get_mapping_from_gcs( + project_id, bucket, f"{col}_mapping.json" + ) + df[col] = df[col].map(mapping) + return df data["timestamp"] = pd.to_datetime(data["timestamp"]) df_tmp = get_lag_features(data, target_column, frequency) - df_tmp = encode_categorical_features(df_tmp) df_tmp = get_time_and_cyclic_features(df_tmp, frequency) + if job_type == "train": + df_tmp = encode_categorical_features(df_tmp) + elif job_type == "predict": + df_tmp = decode_categorical_features(df_tmp) return df_tmp @@ -326,173 +336,80 @@ def objective(trial): clf, project_id, bucket, "hourly_forecast_model.pkl" ) - #### FORECAST JOB UTILS #### - - - @staticmethod - def generate_hourly_forecasts(data, project_name, bucket_name, source_blob_name): - data["timestamp"] = pd.to_datetime(data["timestamp"]) - - def get_new_row(df, device1, model): - last_row = df[df["device_number"] == device1].iloc[-1] - new_row = pd.Series(index=last_row.index, 
dtype="float64") - for i in fixed_columns: - new_row[i] = last_row[i] - new_row["timestamp"] = last_row["timestamp"] + pd.Timedelta(hours=1) - new_row["device_number"] = device1 - new_row[f"pm2_5_last_1_hour"] = last_row["pm2_5"] - new_row[f"pm2_5_last_2_hour"] = last_row[f"pm2_5_last_{1}_hour"] - - shifts = [6, 12, 24, 48] - functions = ["mean", "std", "median", "skew"] - for s in shifts: - for f in functions: - if f == "mean": - new_row[f"pm2_5_{f}_{s}_hour"] = ( - last_row["pm2_5"] - + last_row[f"pm2_5_{f}_{s}_hour"] * (s - 1) - ) / s - elif f == "std": - new_row[f"pm2_5_{f}_{s}_hour"] = ( - np.sqrt( - (last_row["pm2_5"] - last_row[f"pm2_5_mean_{s}_hour"]) - ** 2 - + (last_row[f"pm2_5_{f}_{s}_hour"] ** 2 * (s - 1)) - ) - / s - ) - elif f == "median": - new_row[f"pm2_5_{f}_{s}_hour"] = np.median( - np.append( - last_row["pm2_5"], last_row[f"pm2_5_{f}_{s}_hour"] - ) - ) - elif f == "skew": - new_row[f"pm2_5_{f}_{s}_hour"] = skew( - np.append( - last_row["pm2_5"], last_row[f"pm2_5_{f}_{s}_hour"] - ) - ) - - attributes = ["year", "month", "day", "dayofweek", "hour", "minute"] - for a in attributes: - new_row[a] = new_row["timestamp"].__getattribute__(a) - new_row["week"] = new_row["timestamp"].isocalendar().week - - new_row["pm2_5"] = model.predict( - new_row.drop(fixed_columns + ["timestamp", "pm2_5"]).values.reshape( - 1, -1 - ) - )[0] - return new_row - - forecasts = pd.DataFrame() - forecast_model = get_trained_model_from_gcs( - project_name, bucket_name, source_blob_name - ) - df_tmp = data.copy() - for device in df_tmp["device_number"].unique(): - test_copy = df_tmp[df_tmp["device_number"] == device] - for i in range(int(configuration.HOURLY_FORECAST_HORIZON)): - new_row = get_new_row(test_copy, device, forecast_model) - test_copy = pd.concat( - [test_copy, new_row.to_frame().T], ignore_index=True - ) - forecasts = pd.concat([forecasts, test_copy], ignore_index=True) - - forecasts["device_number"] = forecasts["device_number"].astype(int) - forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) - forecasts.rename(columns={"timestamp": "time"}, inplace=True) - forecasts["time"] = pd.to_datetime(forecasts["time"], utc=True) - current_time = datetime.utcnow() - current_time_utc = pd.Timestamp(current_time, tz="UTC") - result = forecasts[fixed_columns + ["time", "pm2_5", "device_number"]][ - forecasts["time"] >= current_time_utc - ] - - return result - @staticmethod - def generate_daily_forecasts(data, project_name, bucket_name, source_blob_name): - data["timestamp"] = pd.to_datetime(data["timestamp"]) - - def get_new_row(df_tmp, device, model): - last_row = df_tmp[df_tmp["device_number"] == device].iloc[-1] - new_row = pd.Series(index=last_row.index, dtype="float64") - for i in fixed_columns: - new_row[i] = last_row[i] - new_row["timestamp"] = last_row["timestamp"] + pd.Timedelta(days=1) - new_row["device_number"] = device - new_row[f"pm2_5_last_1_day"] = last_row["pm2_5"] - new_row[f"pm2_5_last_2_day"] = last_row[f"pm2_5_last_{1}_day"] - - shifts = [3, 7, 14, 30] - functions = ["mean", "std", "max", "min"] - for s in shifts: - for f in functions: - if f == "mean": - new_row[f"pm2_5_{f}_{s}_day"] = ( - last_row["pm2_5"] + last_row[f"pm2_5_{f}_{s}_day"] * (s - 1) - ) / s - elif f == "std": - new_row[f"pm2_5_{f}_{s}_day"] = ( - np.sqrt( - (last_row["pm2_5"] - last_row[f"pm2_5_mean_{s}_day"]) - ** 2 - + (last_row[f"pm2_5_{f}_{s}_day"] ** 2 * (s - 1)) - ) - / s - ) - elif f == "max": - new_row[f"pm2_5_{f}_{s}_day"] = max( - last_row["pm2_5"], last_row[f"pm2_5_{f}_{s}_day"] - ) - elif f == 
"min": - new_row[f"pm2_5_{f}_{s}_day"] = min( - last_row["pm2_5"], last_row[f"pm2_5_{f}_{s}_day"] - ) - - # Use the date of the new row to create other features - attributes = ["year", "month", "day", "dayofweek"] - for a in attributes: - new_row[a] = new_row["timestamp"].__getattribute__(a) - new_row["week"] = new_row["timestamp"].isocalendar().week + def generate_forecasts(data, project_name, bucket_name, source_blob_name, frequency): + data['timestamp'] = pd.to_datetime(data['timestamp']) + data['pm2_5_lower'] = data['pm2_5_upper'] = data['margin_of_error'] = 0 + + def get_new_row(df, device_id, forecast_model, lower_quantile_model, upper_quantile_model, frequency): + last_row = df[df['device_id'] == device_id].iloc[-1] + new_row = pd.Series(index=last_row.index, dtype='float64') + if frequency == 'hourly': + new_row['timestamp'] = last_row['timestamp'] + pd.Timedelta(hours=1) + new_row['device_id'] = device_id + new_row[f'pm2_5_last_1_hour'] = last_row['pm2_5'] + new_row[f'pm2_5_last_2_hour'] = last_row[f'pm2_5_last_{1}_hour'] + elif frequency == 'daily': + new_row['timestamp'] = last_row['timestamp'] + pd.Timedelta(days=1) + new_row['device_id'] = device_id + new_row[f'pm2_5_last_1_day'] = last_row['pm2_5'] + new_row[f'pm2_5_last_2_day'] = last_row[f'pm2_5_last_{1}_day'] + new_row[f'f"pm2_5_last_3_day'] = last_row[f'pm2_5_last_{2}_day'] + shifts1 = [3, 7, 14] + for s in shifts1: + new_row[f'pm2_5_last_{s}_day'] = df[df['device_id'] == device_id]['pm2_5'].shift(s).iloc[-1] + + shifts2 = [3, 7, 14, 30] + functions = ['mean', 'std', 'max', 'min'] + for s in shifts2: + for f in functions: + if f == 'mean': + new_row[f'pm2_5_{f}_{s}_day'] = (last_row['pm2_5'] + last_row[f'pm2_5_{f}_{s}_day']*(s-1))/s + elif f == 'std': + new_row[f'pm2_5_{f}_{s}_day'] = np.sqrt((last_row['pm2_5'] - last_row[f'pm2_5_mean_{s}_day'])**2 + (last_row[f'pm2_5_{f}_{s}_day']**2*(s-1)))/s + elif f == 'max': + new_row[f'pm2_5_{f}_{s}_day'] = max(last_row['pm2_5'], last_row[f'pm2_5_{f}_{s}_day']) + elif f == 'min': + new_row[f'pm2_5_{f}_{s}_day'] = min(last_row['pm2_5'], last_row[f'pm2_5_{f}_{s}_day']) + attributes = ['year', 'month', 'day', 'dayofweek'] + max_vals = [2023, 12, 31, 6, 23] + if frequency == 'hourly': + attributes.extend(['hour', 'minute']) + max_vals.append([23, 59]) + for a, m in zip(attributes, max_vals): + new_row[a] = new_row['timestamp'].dt.__getattribute__(a) + new_row[a + '_sin'] = np.sin(2 * np.pi * new_row[a] / m) + new_row[a + '_cos'] = np.cos(2 * np.pi * new_row[a] / m) + new_row['week'] = new_row['timestamp'].dt.isocalendar().week + new_row['week_sin'] = np.sin(2 * np.pi * new_row['week'] / 52) + new_row['week_cos'] = np.cos(2 * np.pi * new_row['week'] / 52) + direct_forecast = forecast_model.predict(new_row.drop(['timestamp', 'pm2_5']).values.reshape(1, -1))[0] + new_row['pm2_5_lower'] = lower_quantile_model.predict(new_row.drop(['timestamp', 'pm2_5']).values.reshape(1, -1))[0] + new_row['pm2_5_upper'] = upper_quantile_model.predict(new_row.drop(['timestamp', 'pm2_5']).values.reshape(1, -1))[0] + new_row['margin_of_error'] = (new_row['pm2_5_upper'] - new_row['pm2_5_lower']) / 2 + new_row['pm2_5'] = direct_forecast + new_row['margin_of_error'] - new_row["pm2_5"] = model.predict( - new_row.drop(fixed_columns + ["timestamp", "pm2_5"]).values.reshape( - 1, -1 - ) - )[0] return new_row forecasts = pd.DataFrame() - - forecast_model = get_trained_model_from_gcs( - project_name, bucket_name, source_blob_name - ) - + forecast_model = get_trained_model_from_gcs(project_name, bucket_name, 
source_blob_name) + lower_quantile_model = get_trained_model_from_gcs(project_name, bucket_name, 'daily_forecast_model_lower_quantile.pkl') + upper_quantile_model = get_trained_model_from_gcs(project_name, bucket_name, 'daily_forecast_model_upper_quantile.pkl') df_tmp = data.copy() - for device in df_tmp["device_number"].unique(): - test_copy = df_tmp[df_tmp["device_number"] == device] - for i in range(int(configuration.DAILY_FORECAST_HORIZON)): - new_row = get_new_row( - test_copy, - device, - forecast_model, - ) - test_copy = pd.concat( - [test_copy, new_row.to_frame().T], ignore_index=True - ) + for device in df_tmp['device_id'].unique(): + test_copy = df_tmp[df_tmp['device_id'] == device] + horizon = configuration.HOURLY_FORECAST_HORIZON if frequency == 'hourly' else configuration.DAILY_FORECAST_HORIZON + for i in range(int(horizon)): + new_row = get_new_row(test_copy, device, forecast_model, lower_quantile_model, upper_quantile_model, frequency) + test_copy = pd.concat([test_copy, new_row.to_frame().T], ignore_index=True) forecasts = pd.concat([forecasts, test_copy], ignore_index=True) - forecasts["device_number"] = forecasts["device_number"].astype(int) - forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) - forecasts.rename(columns={"timestamp": "time"}, inplace=True) - current_time = datetime.utcnow() - current_time_utc = pd.Timestamp(current_time, tz="UTC") - result = forecasts[fixed_columns + ["time", "pm2_5", "device_number"]][ - forecasts["time"] >= current_time_utc - ] + forecasts['pm2_5'] = forecasts['pm2_5'].astype(float) + forecasts['pm2_5_lower'] = forecasts['pm2_5_lower'].astype(float) + forecasts['pm2_5_upper'] = forecasts['pm2_5_upper'].astype(float) + forecasts['margin_of_error'] = forecasts['margin_of_error'].astype(float) + current_time_utc = pd.Timestamp(datetime.utcnow(), tz='UTC') + result = forecasts[['timestamp', 'pm2_5', 'pm2_5_lower', 'pm2_5_upper', 'margin_of_error', 'device_id', 'site_id']][forecasts['timestamp'] >= current_time_utc] return result @staticmethod diff --git a/src/airflow/dags/ml_training_jobs.py b/src/airflow/dags/ml_training_jobs.py index 69eaaa900f..05135dc2dc 100644 --- a/src/airflow/dags/ml_training_jobs.py +++ b/src/airflow/dags/ml_training_jobs.py @@ -29,7 +29,7 @@ def fetch_training_data_for_hourly_forecast_model(): @task() def preprocess_training_data_for_hourly_forecast_model(data): - return ForecastUtils.preprocess__data(data, "hourly") + return ForecastUtils.preprocess_data(data, "hourly") @task() def feature_engineer_training_data_for_hourly_forecast_model(data): @@ -53,7 +53,7 @@ def fetch_training_data_for_daily_forecast_model(): @task() def preprocess_training_data_for_daily_forecast_model(data): - return ForecastUtils.preprocess__data(data, "daily") + return ForecastUtils.preprocess_data(data, "daily") @task() def feature_engineer_data_for_daily_forecast_model(data): From 33ee0e44d2d5ca4e2722df2837923f716fc98eae Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Tue, 22 Aug 2023 12:12:42 +0300 Subject: [PATCH 07/43] setup dags for all jobs --- src/airflow/airflow-requirements.txt | 2 +- src/airflow/airqo_etl_utils/bigquery_api.py | 17 +- src/airflow/airqo_etl_utils/ml_utils.py | 287 ++++++++++++-------- src/airflow/dags/ml_prediction_jobs.py | 51 +--- src/airflow/dags/ml_training_jobs.py | 13 +- src/airflow/dev-requirements.txt | 3 +- src/airflow/requirements.txt | 5 +- 7 files changed, 203 insertions(+), 175 deletions(-) diff --git a/src/airflow/airflow-requirements.txt 
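The forecast job above now pairs the point model with two LightGBM quantile regressors (alpha 0.025 and 0.975) and derives a margin of error from their spread. A rough standalone sketch of that idea on toy data (model sizes and array shapes are arbitrary here):

import numpy as np
from lightgbm import LGBMRegressor

rng = np.random.default_rng(1)
X = rng.normal(size=(400, 5))
y = 3.0 * X[:, 0] + rng.normal(scale=0.5, size=400)

point_model = LGBMRegressor(n_estimators=100, random_state=42).fit(X, y)

# One model per quantile; alpha sets the target quantile of the loss.
lower_model = LGBMRegressor(objective="quantile", alpha=0.025,
                            n_estimators=100, random_state=42).fit(X, y)
upper_model = LGBMRegressor(objective="quantile", alpha=0.975,
                            n_estimators=100, random_state=42).fit(X, y)

X_new = rng.normal(size=(3, 5))
pm2_5 = point_model.predict(X_new)
lower = lower_model.predict(X_new)
upper = upper_model.predict(X_new)
margin_of_error = (upper - lower) / 2  # half-width of the ~95% interval

print(pm2_5, lower, upper, margin_of_error)

Whether that half-width is then added onto the point forecast, as the patch does, or reported alongside it as an interval is a modelling choice rather than something the quantile models dictate.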
b/src/airflow/airflow-requirements.txt index 48af8ae3aa..45dd7c89e4 100644 --- a/src/airflow/airflow-requirements.txt +++ b/src/airflow/airflow-requirements.txt @@ -7,4 +7,4 @@ lightgbm mlflow gcsfs pymongo -category-encoders \ No newline at end of file +optuna \ No newline at end of file diff --git a/src/airflow/airqo_etl_utils/bigquery_api.py b/src/airflow/airqo_etl_utils/bigquery_api.py index 360fb74df0..1d720b6772 100644 --- a/src/airflow/airqo_etl_utils/bigquery_api.py +++ b/src/airflow/airqo_etl_utils/bigquery_api.py @@ -615,22 +615,7 @@ def fetch_raw_readings(self) -> pd.DataFrame: except Exception as e: raise e - def fetch_data(self, start_date_time: str, historical: bool = False): - # historical is for the actual jobs, not training - query = f""" - SELECT DISTINCT timestamp as created_at, {"site_id," if historical else ""} device_number, pm2_5_calibrated_value as pm2_5 - FROM `{self.hourly_measurements_table_prod}` - WHERE DATE(timestamp) >= '{start_date_time}' and device_number IS NOT NULL - ORDER BY created_at, device_number - """ - - job_config = bigquery.QueryJobConfig() - job_config.use_query_cache = True - - df = self.client.query(f"{query}", job_config).result().to_dataframe() - return df - - def fetch_training_data( + def fetch_data( self, start_date_time: str, ) -> pd.DataFrame: diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index d900eac7e2..68944efb59 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -64,9 +64,9 @@ class ForecastUtils: @staticmethod def preprocess_data(data, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) - data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])["pm2_5"].transform( - lambda x: x.interpolate(method="linear", limit_direction="both") - ) + data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])[ + "pm2_5" + ].transform(lambda x: x.interpolate(method="linear", limit_direction="both")) if frequency == "daily": data = ( data.groupby(["device_id", "site_id", "device_category"]) @@ -74,7 +74,9 @@ def preprocess_data(data, frequency): .mean(numeric_only=True) ) data.reset_index(inplace=True) - data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])["pm2_5"].transform( + data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])[ + "pm2_5" + ].transform( lambda x: x.interpolate(method="linear", limit_direction="both") ) data = data.dropna(subset=["pm2_5"]) @@ -122,7 +124,7 @@ def get_lag_features(df, target_col, freq): return df - def encode_categorical_features(df): + def encode_categorical_features(df, frequency): columns = ["device_id", "site_id", "device_category"] mappings = [] for col in columns: @@ -136,13 +138,13 @@ def encode_categorical_features(df): mappings.append(mapping) for i, col in enumerate(columns): upload_mapping_to_gcs( - mappings[i], project_id, bucket, f"{col}_mapping.json" + mappings[i], project_id, bucket, f"{frequency}_{col}_mapping.json" ) return df def get_time_and_cyclic_features(df, freq): attributes = ["year", "month", "day", "dayofweek"] - max_vals = [2023, 12, 31, 6, 23] + max_vals = [2023, 12, 30, 7] if freq == "hourly": attributes.extend(["hour", "minute"]) max_vals.append([23, 59]) @@ -157,14 +159,21 @@ def get_time_and_cyclic_features(df, freq): df.drop(columns=attributes + ["week"], inplace=True) return df - def decode_categorical_features(df): + def decode_categorical_features(df, frequency): columns = ["device_id", 
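preprocess_data above collapses hourly readings to daily means per device, site and category with a grouped resample, then interpolates the remaining gaps. The same pattern in isolation, on a synthetic frame (one device, three days of hourly values assumed):

import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
hourly = pd.DataFrame({
    "device_id": ["dev_a"] * 72,
    "site_id": ["site_1"] * 72,
    "device_category": ["lowcost"] * 72,
    "timestamp": pd.date_range("2023-08-01", periods=72, freq="H"),
    "pm2_5": rng.uniform(10.0, 80.0, size=72),
})

# Hourly -> daily means per device/site/category.
daily = (
    hourly.groupby(["device_id", "site_id", "device_category"])
    .resample("D", on="timestamp")
    .mean(numeric_only=True)
    .reset_index()
)

# Fill any gaps left by missing hours with a per-device linear interpolation.
daily["pm2_5"] = daily.groupby("device_id")["pm2_5"].transform(
    lambda x: x.interpolate(method="linear", limit_direction="both")
)
print(daily)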
"site_id", "device_category"] for col in columns: - mapping = get_mapping_from_gcs( - project_id, bucket, f"{col}_mapping.json" + if frequency == "hourly": + mapping = get_mapping_from_gcs( + project_id, bucket, f"hourly_{col}_mapping.json" + ) + elif frequency == "daily": + mapping = get_mapping_from_gcs( + project_id, bucket, f"daily_{col}_mapping.json" ) + df[col] = df[col].map(mapping) return df + data["timestamp"] = pd.to_datetime(data["timestamp"]) df_tmp = get_lag_features(data, target_column, frequency) df_tmp = get_time_and_cyclic_features(df_tmp, frequency) @@ -215,7 +224,6 @@ def train_and_save_forecast_models(train, frequency): test_data[target_col], ) - sampler = optuna.samplers.TPESampler() pruner = optuna.pruners.SuccessiveHalvingPruner( min_resource=10, reduction_factor=2, min_early_stopping_rate=0 @@ -260,12 +268,13 @@ def objective(trial): study.optimize(objective, n_trials=15) - mlflow.set_tracking_uri(configuration.MLFLOW_TRACKING_URI) mlflow.set_experiment(f"{frequency}_forecast_model_{environment}") registered_model_name = f"{frequency}_forecast_model_{environment}" - mlflow.lightgbm.autolog(registered_model_name=registered_model_name, log_datasets=False) + mlflow.lightgbm.autolog( + registered_model_name=registered_model_name, log_datasets=False + ) with mlflow.start_run(): best_params = study.best_params print(f"Best params are {best_params}") @@ -289,127 +298,181 @@ def objective(trial): callbacks=[early_stopping(stopping_rounds=150)], ) - # train quantile regression models for 0.025 and 0.975 quantiles - clf_025 = LGBMRegressor( - n_estimators=best_params["n_estimators"], - learning_rate=best_params["learning_rate"], - colsample_bytree=best_params["colsample_bytree"], - reg_alpha=best_params["reg_alpha"], - reg_lambda=best_params["reg_lambda"], - max_depth=best_params["max_depth"], - random_state=42, - verbosity=2, - objective="quantile", - alpha=0.025, - metric="quantile", - ) - - clf_025.fit( - train_data[features], - train_target, - eval_set=[(test_data[features], test_target)], - categorical_feature=["device_id", "site_id", "device_category"], - ) - - clf_975 = LGBMRegressor( - n_estimators=best_params["n_estimators"], - learning_rate=best_params["learning_rate"], - colsample_bytree=best_params["colsample_bytree"], - reg_alpha=best_params["reg_alpha"], - reg_lambda=best_params["reg_lambda"], - max_depth=best_params["max_depth"], - random_state=42, - verbosity=2, - objective="quantile", - alpha=0.975, - metric="quantile", - ) + upload_trained_model_to_gcs( + clf, project_id, bucket, f"{frequency}_forecast_model.pkl" + ) - clf_975.fit( - train_data[features], - train_target, - eval_set=[(test_data[features], test_target)], - categorical_feature=["device_id", "site_id", "device_category"], - ) + alphas = [0.025, 0.975] + models = [] + names = [f'{frequency}_lower_quantile_model', f'{frequency}_upper_quantile_model'] - upload_trained_model_to_gcs( - clf, project_id, bucket, "hourly_forecast_model.pkl" + for alpha in alphas: + clf = LGBMRegressor( + n_estimators=best_params["n_estimators"], + learning_rate=best_params["learning_rate"], + colsample_bytree=best_params["colsample_bytree"], + reg_alpha=best_params["reg_alpha"], + reg_lambda=best_params["reg_lambda"], + max_depth=best_params["max_depth"], + random_state=42, + verbosity=2, + objective="quantile", + alpha=alpha, + metric="quantile", ) + clf.fit( + train_data[features], + train_target, + eval_set=[(test_data[features], test_target)], + categorical_feature=["device_id", "site_id", "device_category"], + ) + 
models.append(clf) + for n, m in zip(names, models): + upload_trained_model_to_gcs( + m, project_id, bucket, f"{n}.pkl" + ) + @staticmethod - def generate_forecasts(data, project_name, bucket_name, source_blob_name, frequency): - data['timestamp'] = pd.to_datetime(data['timestamp']) - data['pm2_5_lower'] = data['pm2_5_upper'] = data['margin_of_error'] = 0 - - def get_new_row(df, device_id, forecast_model, lower_quantile_model, upper_quantile_model, frequency): - last_row = df[df['device_id'] == device_id].iloc[-1] - new_row = pd.Series(index=last_row.index, dtype='float64') - if frequency == 'hourly': - new_row['timestamp'] = last_row['timestamp'] + pd.Timedelta(hours=1) - new_row['device_id'] = device_id - new_row[f'pm2_5_last_1_hour'] = last_row['pm2_5'] - new_row[f'pm2_5_last_2_hour'] = last_row[f'pm2_5_last_{1}_hour'] - elif frequency == 'daily': - new_row['timestamp'] = last_row['timestamp'] + pd.Timedelta(days=1) - new_row['device_id'] = device_id - new_row[f'pm2_5_last_1_day'] = last_row['pm2_5'] - new_row[f'pm2_5_last_2_day'] = last_row[f'pm2_5_last_{1}_day'] - new_row[f'f"pm2_5_last_3_day'] = last_row[f'pm2_5_last_{2}_day'] + def generate_forecasts( + data, project_name, bucket_name, frequency + ): + data["timestamp"] = pd.to_datetime(data["timestamp"]) + data["pm2_5_lower"] = data["pm2_5_upper"] = data["margin_of_error"] = 0 + + def get_new_row( + df, + device_id, + forecast_model, + lower_quantile_model, + upper_quantile_model, + frequency, + ): + last_row = df[df["device_id"] == device_id].iloc[-1] + new_row = pd.Series(index=last_row.index, dtype="float64") + if frequency == "hourly": + new_row["timestamp"] = last_row["timestamp"] + pd.Timedelta(hours=1) + new_row["device_id"] = device_id + new_row[f"pm2_5_last_1_hour"] = last_row["pm2_5"] + new_row[f"pm2_5_last_2_hour"] = last_row[f"pm2_5_last_{1}_hour"] + elif frequency == "daily": + new_row["timestamp"] = last_row["timestamp"] + pd.Timedelta(days=1) + new_row["device_id"] = device_id + new_row[f"pm2_5_last_1_day"] = last_row["pm2_5"] + new_row[f"pm2_5_last_2_day"] = last_row[f"pm2_5_last_{1}_day"] + new_row[f'f"pm2_5_last_3_day'] = last_row[f"pm2_5_last_{2}_day"] shifts1 = [3, 7, 14] for s in shifts1: - new_row[f'pm2_5_last_{s}_day'] = df[df['device_id'] == device_id]['pm2_5'].shift(s).iloc[-1] + new_row[f"pm2_5_last_{s}_day"] = ( + df[df["device_id"] == device_id]["pm2_5"].shift(s).iloc[-1] + ) shifts2 = [3, 7, 14, 30] - functions = ['mean', 'std', 'max', 'min'] + functions = ["mean", "std", "max", "min"] for s in shifts2: for f in functions: - if f == 'mean': - new_row[f'pm2_5_{f}_{s}_day'] = (last_row['pm2_5'] + last_row[f'pm2_5_{f}_{s}_day']*(s-1))/s - elif f == 'std': - new_row[f'pm2_5_{f}_{s}_day'] = np.sqrt((last_row['pm2_5'] - last_row[f'pm2_5_mean_{s}_day'])**2 + (last_row[f'pm2_5_{f}_{s}_day']**2*(s-1)))/s - elif f == 'max': - new_row[f'pm2_5_{f}_{s}_day'] = max(last_row['pm2_5'], last_row[f'pm2_5_{f}_{s}_day']) - elif f == 'min': - new_row[f'pm2_5_{f}_{s}_day'] = min(last_row['pm2_5'], last_row[f'pm2_5_{f}_{s}_day']) - attributes = ['year', 'month', 'day', 'dayofweek'] + if f == "mean": + new_row[f"pm2_5_{f}_{s}_day"] = ( + last_row["pm2_5"] + + last_row[f"pm2_5_{f}_{s}_day"] * (s - 1) + ) / s + elif f == "std": + new_row[f"pm2_5_{f}_{s}_day"] = ( + np.sqrt( + ( + last_row["pm2_5"] + - last_row[f"pm2_5_mean_{s}_day"] + ) + ** 2 + + (last_row[f"pm2_5_{f}_{s}_day"] ** 2 * (s - 1)) + ) + / s + ) + elif f == "max": + new_row[f"pm2_5_{f}_{s}_day"] = max( + last_row["pm2_5"], last_row[f"pm2_5_{f}_{s}_day"] + ) + elif f 
== "min": + new_row[f"pm2_5_{f}_{s}_day"] = min( + last_row["pm2_5"], last_row[f"pm2_5_{f}_{s}_day"] + ) + attributes = ["year", "month", "day", "dayofweek"] max_vals = [2023, 12, 31, 6, 23] - if frequency == 'hourly': - attributes.extend(['hour', 'minute']) + if frequency == "hourly": + attributes.extend(["hour", "minute"]) max_vals.append([23, 59]) for a, m in zip(attributes, max_vals): - new_row[a] = new_row['timestamp'].dt.__getattribute__(a) - new_row[a + '_sin'] = np.sin(2 * np.pi * new_row[a] / m) - new_row[a + '_cos'] = np.cos(2 * np.pi * new_row[a] / m) - new_row['week'] = new_row['timestamp'].dt.isocalendar().week - new_row['week_sin'] = np.sin(2 * np.pi * new_row['week'] / 52) - new_row['week_cos'] = np.cos(2 * np.pi * new_row['week'] / 52) - direct_forecast = forecast_model.predict(new_row.drop(['timestamp', 'pm2_5']).values.reshape(1, -1))[0] - new_row['pm2_5_lower'] = lower_quantile_model.predict(new_row.drop(['timestamp', 'pm2_5']).values.reshape(1, -1))[0] - new_row['pm2_5_upper'] = upper_quantile_model.predict(new_row.drop(['timestamp', 'pm2_5']).values.reshape(1, -1))[0] - new_row['margin_of_error'] = (new_row['pm2_5_upper'] - new_row['pm2_5_lower']) / 2 - new_row['pm2_5'] = direct_forecast + new_row['margin_of_error'] + new_row[a] = new_row["timestamp"].dt.__getattribute__(a) + new_row[a + "_sin"] = np.sin(2 * np.pi * new_row[a] / m) + new_row[a + "_cos"] = np.cos(2 * np.pi * new_row[a] / m) + new_row["week"] = new_row["timestamp"].dt.isocalendar().week + new_row["week_sin"] = np.sin(2 * np.pi * new_row["week"] / 52) + new_row["week_cos"] = np.cos(2 * np.pi * new_row["week"] / 52) + direct_forecast = forecast_model.predict( + new_row.drop(["timestamp", "pm2_5"]).values.reshape(1, -1) + )[0] + new_row["pm2_5_lower"] = lower_quantile_model.predict( + new_row.drop(["timestamp", "pm2_5"]).values.reshape(1, -1) + )[0] + new_row["pm2_5_upper"] = upper_quantile_model.predict( + new_row.drop(["timestamp", "pm2_5"]).values.reshape(1, -1) + )[0] + new_row["margin_of_error"] = ( + new_row["pm2_5_upper"] - new_row["pm2_5_lower"] + ) / 2 + new_row["pm2_5"] = direct_forecast + new_row["margin_of_error"] return new_row forecasts = pd.DataFrame() - forecast_model = get_trained_model_from_gcs(project_name, bucket_name, source_blob_name) - lower_quantile_model = get_trained_model_from_gcs(project_name, bucket_name, 'daily_forecast_model_lower_quantile.pkl') - upper_quantile_model = get_trained_model_from_gcs(project_name, bucket_name, 'daily_forecast_model_upper_quantile.pkl') + forecast_model = get_trained_model_from_gcs( + project_name, bucket_name, f"{frequency}_forecast_model.pkl" + ) + lower_quantile_model = get_trained_model_from_gcs( + project_name, bucket_name, f"{frequency}_lower_quantile_model.pkl" + ) + upper_quantile_model = get_trained_model_from_gcs( + project_name, bucket_name, f"{frequency}_upper_quantile_model.pkl" + ) df_tmp = data.copy() - for device in df_tmp['device_id'].unique(): - test_copy = df_tmp[df_tmp['device_id'] == device] - horizon = configuration.HOURLY_FORECAST_HORIZON if frequency == 'hourly' else configuration.DAILY_FORECAST_HORIZON + for device in df_tmp["device_id"].unique(): + test_copy = df_tmp[df_tmp["device_id"] == device] + horizon = ( + configuration.HOURLY_FORECAST_HORIZON + if frequency == "hourly" + else configuration.DAILY_FORECAST_HORIZON + ) for i in range(int(horizon)): - new_row = get_new_row(test_copy, device, forecast_model, lower_quantile_model, upper_quantile_model, frequency) - test_copy = pd.concat([test_copy, new_row.to_frame().T], 
ignore_index=True) + new_row = get_new_row( + test_copy, + device, + forecast_model, + lower_quantile_model, + upper_quantile_model, + frequency, + ) + test_copy = pd.concat( + [test_copy, new_row.to_frame().T], ignore_index=True + ) forecasts = pd.concat([forecasts, test_copy], ignore_index=True) - forecasts['pm2_5'] = forecasts['pm2_5'].astype(float) - forecasts['pm2_5_lower'] = forecasts['pm2_5_lower'].astype(float) - forecasts['pm2_5_upper'] = forecasts['pm2_5_upper'].astype(float) - forecasts['margin_of_error'] = forecasts['margin_of_error'].astype(float) - current_time_utc = pd.Timestamp(datetime.utcnow(), tz='UTC') - result = forecasts[['timestamp', 'pm2_5', 'pm2_5_lower', 'pm2_5_upper', 'margin_of_error', 'device_id', 'site_id']][forecasts['timestamp'] >= current_time_utc] + forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) + forecasts["pm2_5_lower"] = forecasts["pm2_5_lower"].astype(float) + forecasts["pm2_5_upper"] = forecasts["pm2_5_upper"].astype(float) + forecasts["margin_of_error"] = forecasts["margin_of_error"].astype(float) + current_time_utc = pd.Timestamp(datetime.utcnow(), tz="UTC") + forecasts.rename(columns={"timestamp": "time"}, inplace=True) + result = forecasts[ + [ + "timestamp", + "pm2_5", + "pm2_5_lower", + "pm2_5_upper", + "margin_of_error", + "device_id", + "site_id", + ] + ][forecasts["time"] >= current_time_utc] return result @staticmethod diff --git a/src/airflow/dags/ml_prediction_jobs.py b/src/airflow/dags/ml_prediction_jobs.py index 2f48d19d68..f90233afa4 100644 --- a/src/airflow/dags/ml_prediction_jobs.py +++ b/src/airflow/dags/ml_prediction_jobs.py @@ -27,25 +27,19 @@ def get_historical_data_for_hourly_forecasts(): from airqo_etl_utils.date import date_to_str start_date = date_to_str(start_date, str_format="%Y-%m-%d") - return BigQueryApi().fetch_data(start_date, historical=True) + return BigQueryApi().fetch_data(start_date) @task() def preprocess_historical_data_hourly_forecast(data): - return ForecastUtils.preprocess_historical_data(data, "hourly") - - @task() - def add_lag_features_historical_data_hourly_forecast(data): - return ForecastUtils.get_lag_features(data, "pm2_5", frequency="hourly") + return ForecastUtils.preprocess_data(data, "hourly") @task - def add_timestep_features_historical_data_hourly_forecasts(data): - return ForecastUtils.get_time_features(data, frequency="hourly") + def feature_eng_hourly_historical_data(data): + return ForecastUtils.feature_eng_data(data, 'pm2_5', 'hourly', 'predict') @task() def make_hourly_forecasts(data): - return ForecastUtils.generate_hourly_forecasts( - data, project_id, bucket, "hourly_forecast_model.pkl" - ) + return ForecastUtils.generate_forecasts(data=data, project_name=project_id, bucket_name= bucket,frequency='hourly') @task() def save_hourly_forecasts_to_bigquery(data): @@ -67,25 +61,20 @@ def get_historical_data_for_daily_forecasts(): days=int(configuration.DAILY_FORECAST_PREDICTION_JOB_SCOPE) ) start_date = date_to_str(start_date, str_format="%Y-%m-%d") - return BigQueryApi().fetch_data(start_date, historical=True) + return BigQueryApi().fetch_data(start_date) @task() def preprocess_historical_data_daily_forecast(data): - return ForecastUtils.preprocess_historical_data(data, "daily") + return ForecastUtils.preprocess_data(data, "daily") @task() - def add_lag_features_historical_data_daily_forecast(data): - return ForecastUtils.get_lag_features(data, "pm2_5", frequency="daily") - - @task() - def add_timestep_features_historical_data_daily_forecast(data): - return 
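generate_forecasts above rolls the model forward one step at a time: each predicted row is appended to the device's frame so the next step's lag features can be rebuilt from it. Stripped down to a single lag and a toy model, the recursion looks roughly like this (column names kept, everything else invented):

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor

rng = np.random.default_rng(3)
history = pd.DataFrame({
    "timestamp": pd.date_range("2023-08-01", periods=60, freq="D"),
    "pm2_5": rng.uniform(10.0, 80.0, size=60),
})
history["pm2_5_last_1_day"] = history["pm2_5"].shift(1)
train = history.dropna()

model = LGBMRegressor(n_estimators=50, random_state=42)
model.fit(train[["pm2_5_last_1_day"]], train["pm2_5"])

horizon = 7
frame = history.copy()
for _ in range(horizon):
    last = frame.iloc[-1]
    new_row = {
        "timestamp": last["timestamp"] + pd.Timedelta(days=1),
        "pm2_5_last_1_day": last["pm2_5"],  # yesterday's value becomes today's lag
    }
    new_row["pm2_5"] = model.predict(pd.DataFrame([new_row])[["pm2_5_last_1_day"]])[0]
    frame = pd.concat([frame, pd.DataFrame([new_row])], ignore_index=True)

print(frame.tail(horizon))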
ForecastUtils.get_time_features(data, "daily") + def feature_engineer_daily_historical_data(data): + return ForecastUtils.feature_eng_data(data, 'pm2_5', 'daily', 'predict') @task() def make_daily_forecasts(data): - return ForecastUtils.generate_daily_forecasts( - data, project_id, bucket, "daily_forecast_model.pkl" - ) + return ForecastUtils.generate_forecasts(data, project_id, bucket, 'daily') + @task() def save_daily_forecasts_to_bigquery(data): @@ -99,25 +88,15 @@ def save_daily_forecasts_to_mongo(data): hourly_data = get_historical_data_for_hourly_forecasts() preprocessed_hourly_data = preprocess_historical_data_hourly_forecast(hourly_data) - lagged_hourly_data = add_lag_features_historical_data_hourly_forecast( - preprocessed_hourly_data - ) - time_features_hourly_data = add_timestep_features_historical_data_hourly_forecasts( - lagged_hourly_data - ) - hourly_forecasts = make_hourly_forecasts(time_features_hourly_data) + feat_data = feature_eng_hourly_historical_data(preprocessed_hourly_data) + hourly_forecasts = make_hourly_forecasts(feat_data) save_hourly_forecasts_to_bigquery(hourly_forecasts) save_hourly_forecasts_to_mongo(hourly_forecasts) daily_data = get_historical_data_for_daily_forecasts() preprocessed_daily_data = preprocess_historical_data_daily_forecast(daily_data) - lagged_daily_data = add_lag_features_historical_data_daily_forecast( - preprocessed_daily_data - ) - time_features_daily_data = add_timestep_features_historical_data_daily_forecast( - lagged_daily_data - ) - daily_forecasts = make_daily_forecasts(time_features_daily_data) + feat_data = feature_engineer_daily_historical_data(preprocessed_daily_data) + daily_forecasts = make_daily_forecasts(feat_data) save_daily_forecasts_to_bigquery(daily_forecasts) save_daily_forecasts_to_mongo(daily_forecasts) diff --git a/src/airflow/dags/ml_training_jobs.py b/src/airflow/dags/ml_training_jobs.py index 05135dc2dc..32287ee59c 100644 --- a/src/airflow/dags/ml_training_jobs.py +++ b/src/airflow/dags/ml_training_jobs.py @@ -15,6 +15,8 @@ tags=["airqo", "hourly-forecast", "daily-forecast", "training-job"], ) def train_forecasting_models(): + + # Hourly forecast tasks @task() def fetch_training_data_for_hourly_forecast_model(): from dateutil.relativedelta import relativedelta @@ -26,19 +28,20 @@ def fetch_training_data_for_hourly_forecast_model(): ) start_date = date_to_str(start_date, str_format="%Y-%m-%d") return BigQueryApi().fetch_data(start_date) - @task() def preprocess_training_data_for_hourly_forecast_model(data): return ForecastUtils.preprocess_data(data, "hourly") @task() def feature_engineer_training_data_for_hourly_forecast_model(data): - return ForecastUtils.feature_eng_training_data(data, "pm2_5", "hourly") + return ForecastUtils.feature_eng_data(data, "pm2_5", "hourly", "train") @task() def train_and_save_hourly_forecast_model(train_data): - return ForecastUtils.train_and_save_hourly_forecast_model(train_data) + return ForecastUtils.train_and_save_forecast_models(train_data, frequency='hourly') + +# Daily forecast tasks @task() def fetch_training_data_for_daily_forecast_model(): from dateutil.relativedelta import relativedelta @@ -57,11 +60,11 @@ def preprocess_training_data_for_daily_forecast_model(data): @task() def feature_engineer_data_for_daily_forecast_model(data): - return ForecastUtils.feature_eng_training_data(data, "pm2_5", "daily") + return ForecastUtils.feature_eng_data(data, "pm2_5", "daily", "train") @task() def train_and_save_daily_model(train_data): - return 
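The DAG changes above wire the tasks together by passing each task's return value into the next, which is Airflow's TaskFlow pattern (intermediate results travel via XCom). A minimal sketch of that wiring with placeholder tasks, not the project's:

from datetime import datetime

from airflow.decorators import dag, task

@dag(schedule=None, start_date=datetime(2023, 8, 1), catchup=False, tags=["example"])
def toy_training_pipeline():
    @task()
    def fetch():
        return [1.0, 2.0, 3.0]

    @task()
    def preprocess(values):
        return [v * 2 for v in values]

    @task()
    def train(values):
        print(f"training on {values}")

    # Calling one task with another's output creates the dependency chain.
    train(preprocess(fetch()))

toy_training_pipeline()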
ForecastUtils.train_and_save_daily_forecast_model(train_data) + return ForecastUtils.train_and_save_forecast_models(train_data, "daily") hourly_data = fetch_training_data_for_hourly_forecast_model() hourly_data = preprocess_training_data_for_hourly_forecast_model(hourly_data) diff --git a/src/airflow/dev-requirements.txt b/src/airflow/dev-requirements.txt index 59d0561bea..81c23b0562 100644 --- a/src/airflow/dev-requirements.txt +++ b/src/airflow/dev-requirements.txt @@ -18,5 +18,4 @@ mlflow lightgbm gcsfs pymongo -pytest -category_encoders \ No newline at end of file +pytest \ No newline at end of file diff --git a/src/airflow/requirements.txt b/src/airflow/requirements.txt index 947c051adc..7396bc939c 100644 --- a/src/airflow/requirements.txt +++ b/src/airflow/requirements.txt @@ -16,7 +16,6 @@ joblib~=1.3.1 scikit-learn~=1.3.0 gcsfs pymongo~=4.4.1 - +optuna pytest~=7.4.0 -scipy~=1.11.1 -category_encoders \ No newline at end of file +scipy~=1.11.1 \ No newline at end of file From b5a5ac66f51757813534ec29827c9b312f157da8 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Tue, 22 Aug 2023 18:34:24 +0300 Subject: [PATCH 08/43] cleans up training job code --- src/airflow/airqo_etl_utils/ml_utils.py | 58 +++++++++++++------------ 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 68944efb59..719f136228 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -85,10 +85,11 @@ def preprocess_data(data, frequency): @staticmethod def feature_eng_data(data, target_column, frequency, job_type): def get_lag_features(df, target_col, freq): + df1 = df.copy() if freq == "daily": shifts = [1, 2, 3, 7, 14] for s in shifts: - df[f"pm2_5_last_{s}_day"] = df.groupby(["device_id"])[ + df1[f"pm2_5_last_{s}_day"] = df1.groupby(["device_id"])[ target_col ].shift(s) @@ -96,8 +97,8 @@ def get_lag_features(df, target_col, freq): functions = ["mean", "std", "max", "min"] for s in shifts: for f in functions: - df[f"pm2_5_{f}_{s}_day"] = ( - df.groupby(["device_id"])[target_col] + df1[f"pm2_5_{f}_{s}_day"] = ( + df1.groupby(["device_id"])[target_col] .shift(1) .rolling(s) .agg(f) @@ -105,7 +106,7 @@ def get_lag_features(df, target_col, freq): elif freq == "hourly": shifts = [1, 2, 6, 12] for s in shifts: - df[f"pm2_5_last_{s}_hour"] = df.groupby(["device_id"])[ + df1[f"pm2_5_last_{s}_hour"] = df1.groupby(["device_id"])[ target_col ].shift(s) @@ -113,8 +114,8 @@ def get_lag_features(df, target_col, freq): functions = ["mean", "std", "median", "skew"] for s in shifts: for f in functions: - df[f"pm2_5_{f}_{s}_hour"] = ( - df.groupby(["device_id"])[target_col] + df1[f"pm2_5_{f}_{s}_hour"] = ( + df1.groupby(["device_id"])[target_col] .shift(1) .rolling(s) .agg(f) @@ -122,42 +123,44 @@ def get_lag_features(df, target_col, freq): else: raise ValueError("Invalid frequency") - return df + return df1 def encode_categorical_features(df, frequency): + df1 = df.copy() columns = ["device_id", "site_id", "device_category"] mappings = [] for col in columns: mapping = {} - for val in df[col].unique(): + for val in df1[col].unique(): num = random.randint(0, 10000) while num in mapping.values(): num = random.randint(0, 10000) mapping[val] = num - df[col] = df[col].map(mapping) + df1[col] = df1[col].map(mapping) mappings.append(mapping) for i, col in enumerate(columns): upload_mapping_to_gcs( mappings[i], project_id, bucket, 
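With category_encoders dropped from the requirements, the training job above now assigns each category its own random integer and persists the mapping as JSON so the prediction job can re-apply it. A small sketch of that round trip, with a local file standing in for the upload_mapping_to_gcs and get_mapping_from_gcs helpers:

import json
import random

import pandas as pd

train = pd.DataFrame({"device_id": ["dev_a", "dev_b", "dev_a", "dev_c"]})

# Draw a unique random code per category, as the patch does.
mapping = {}
for val in train["device_id"].unique():
    num = random.randint(0, 10000)
    while num in mapping.values():
        num = random.randint(0, 10000)
    mapping[val] = num
train["device_id"] = train["device_id"].map(mapping)

# Persist the mapping; the project writes this JSON to GCS instead of disk.
with open("device_id_mapping.json", "w") as f:
    json.dump(mapping, f)

# The prediction job later loads the same mapping and applies it to new data.
with open("device_id_mapping.json") as f:
    loaded = json.load(f)
new_data = pd.DataFrame({"device_id": ["dev_b", "dev_c"]})
new_data["device_id"] = new_data["device_id"].map(loaded)
print(new_data)

Because the integer codes carry no order, the training code also declares these columns as categorical_feature when fitting LightGBM so they are not treated as continuous values.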
f"{frequency}_{col}_mapping.json" ) - return df + return df1 def get_time_and_cyclic_features(df, freq): + df1 = df.copy() attributes = ["year", "month", "day", "dayofweek"] max_vals = [2023, 12, 30, 7] if freq == "hourly": - attributes.extend(["hour", "minute"]) - max_vals.append([23, 59]) + attributes.append("hour") + max_vals.append(23) for a, m in zip(attributes, max_vals): - df[a] = df["timestamp"].dt.__getattribute__(a) - df[a + "_sin"] = np.sin(2 * np.pi * df[a] / m) - df[a + "_cos"] = np.cos(2 * np.pi * df[a] / m) - - df["week"] = df["timestamp"].dt.isocalendar().week - df["week_sin"] = np.sin(2 * np.pi * df["week"] / 52) - df["week_cos"] = np.cos(2 * np.pi * df["week"] / 52) - df.drop(columns=attributes + ["week"], inplace=True) - return df + df1[a] = df1["timestamp"].dt.__getattribute__(a) + df1[a + "_sin"] = np.sin(2 * np.pi * df1[a] / m) + df1[a + "_cos"] = np.cos(2 * np.pi * df1[a] / m) + + df1["week"] = df1["timestamp"].dt.isocalendar().week + df1["week_sin"] = np.sin(2 * np.pi * df1["week"] / 52) + df1["week_cos"] = np.cos(2 * np.pi * df1["week"] / 52) + df1.drop(columns=attributes + ["week"], inplace=True) + return df1 def decode_categorical_features(df, frequency): columns = ["device_id", "site_id", "device_category"] @@ -174,11 +177,12 @@ def decode_categorical_features(df, frequency): df[col] = df[col].map(mapping) return df - data["timestamp"] = pd.to_datetime(data["timestamp"]) - df_tmp = get_lag_features(data, target_column, frequency) + df_tmp = data.copy() + df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) + df_tmp = get_lag_features(df_tmp, target_column, frequency) df_tmp = get_time_and_cyclic_features(df_tmp, frequency) if job_type == "train": - df_tmp = encode_categorical_features(df_tmp) + df_tmp = encode_categorical_features(df_tmp, frequency) elif job_type == "predict": df_tmp = decode_categorical_features(df_tmp) @@ -199,9 +203,9 @@ def train_and_save_forecast_models(train, frequency): months = device_df["timestamp"].dt.month.unique() train_months = val_months = test_months = [] if frequency == "hourly": - train_months = months[:8] - val_months = months[9] - test_months = months[10] + train_months = months[:4] + val_months = months[4:5] + test_months = months[5:] elif frequency == "daily": train_months = months[:8] val_months = months[8:9] From 255a0cd05c837ab283e2c542d3d0ef43d075d9b2 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Wed, 23 Aug 2023 01:14:53 +0300 Subject: [PATCH 09/43] refactor prediction job --- src/airflow/airqo_etl_utils/ml_utils.py | 193 ++++++++++++------------ 1 file changed, 94 insertions(+), 99 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 719f136228..c13ba9d7c8 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -59,6 +59,20 @@ def get_mapping_from_gcs(project_name, bucket_name, source_blob_name): mapping_dict = json.load(f) return mapping_dict +def decode_categorical_features(df, frequency): + columns = ["device_id", "site_id", "device_category"] + for col in columns: + if frequency == "hourly": + mapping = get_mapping_from_gcs( + project_id, bucket, f"hourly_{col}_mapping.json" + ) + elif frequency == "daily": + mapping = get_mapping_from_gcs( + project_id, bucket, f"daily_{col}_mapping.json" + ) + + df[col] = df[col].map(mapping) + return df class ForecastUtils: @staticmethod @@ -162,20 +176,7 @@ def get_time_and_cyclic_features(df, freq): 
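The sin/cos pairs built here encode calendar fields so that cyclically adjacent values (hour 23 and hour 0, December and January) land close together in feature space; the divisor should be the full length of the cycle so the two ends of the range meet, for example 24 for hours. A standalone sketch of the encoding with the same column naming:

import numpy as np
import pandas as pd

df = pd.DataFrame({"timestamp": pd.date_range("2023-12-30 22:00", periods=6, freq="H")})

for attr, period in [("month", 12), ("dayofweek", 7), ("hour", 24)]:
    values = getattr(df["timestamp"].dt, attr)
    df[f"{attr}_sin"] = np.sin(2 * np.pi * values / period)
    df[f"{attr}_cos"] = np.cos(2 * np.pi * values / period)

week = df["timestamp"].dt.isocalendar().week
df["week_sin"] = np.sin(2 * np.pi * week / 52)
df["week_cos"] = np.cos(2 * np.pi * week / 52)

print(df.round(3))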
df1.drop(columns=attributes + ["week"], inplace=True) return df1 - def decode_categorical_features(df, frequency): - columns = ["device_id", "site_id", "device_category"] - for col in columns: - if frequency == "hourly": - mapping = get_mapping_from_gcs( - project_id, bucket, f"hourly_{col}_mapping.json" - ) - elif frequency == "daily": - mapping = get_mapping_from_gcs( - project_id, bucket, f"daily_{col}_mapping.json" - ) - - df[col] = df[col].map(mapping) - return df + df_tmp = data.copy() df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) @@ -184,7 +185,11 @@ def decode_categorical_features(df, frequency): if job_type == "train": df_tmp = encode_categorical_features(df_tmp, frequency) elif job_type == "predict": - df_tmp = decode_categorical_features(df_tmp) + df_tmp = decode_categorical_features(df_tmp, frequency) + #convert the categorical columns to int + df_tmp['device_id'] = df_tmp['device_id'].astype(int) + df_tmp['site_id'] = df_tmp['site_id'].astype(int) + df_tmp['device_category'] = df_tmp['device_category'].astype(int) return df_tmp @@ -344,89 +349,77 @@ def generate_forecasts( data["timestamp"] = pd.to_datetime(data["timestamp"]) data["pm2_5_lower"] = data["pm2_5_upper"] = data["margin_of_error"] = 0 - def get_new_row( + def get_forecasts( df, - device_id, forecast_model, lower_quantile_model, upper_quantile_model, frequency, + horizon ): - last_row = df[df["device_id"] == device_id].iloc[-1] - new_row = pd.Series(index=last_row.index, dtype="float64") - if frequency == "hourly": - new_row["timestamp"] = last_row["timestamp"] + pd.Timedelta(hours=1) - new_row["device_id"] = device_id - new_row[f"pm2_5_last_1_hour"] = last_row["pm2_5"] - new_row[f"pm2_5_last_2_hour"] = last_row[f"pm2_5_last_{1}_hour"] - elif frequency == "daily": - new_row["timestamp"] = last_row["timestamp"] + pd.Timedelta(days=1) - new_row["device_id"] = device_id - new_row[f"pm2_5_last_1_day"] = last_row["pm2_5"] - new_row[f"pm2_5_last_2_day"] = last_row[f"pm2_5_last_{1}_day"] - new_row[f'f"pm2_5_last_3_day'] = last_row[f"pm2_5_last_{2}_day"] - shifts1 = [3, 7, 14] - for s in shifts1: - new_row[f"pm2_5_last_{s}_day"] = ( - df[df["device_id"] == device_id]["pm2_5"].shift(s).iloc[-1] - ) - - shifts2 = [3, 7, 14, 30] - functions = ["mean", "std", "max", "min"] - for s in shifts2: - for f in functions: - if f == "mean": - new_row[f"pm2_5_{f}_{s}_day"] = ( - last_row["pm2_5"] - + last_row[f"pm2_5_{f}_{s}_day"] * (s - 1) - ) / s - elif f == "std": - new_row[f"pm2_5_{f}_{s}_day"] = ( - np.sqrt( - ( - last_row["pm2_5"] - - last_row[f"pm2_5_mean_{s}_day"] - ) - ** 2 - + (last_row[f"pm2_5_{f}_{s}_day"] ** 2 * (s - 1)) - ) - / s - ) - elif f == "max": - new_row[f"pm2_5_{f}_{s}_day"] = max( - last_row["pm2_5"], last_row[f"pm2_5_{f}_{s}_day"] - ) - elif f == "min": - new_row[f"pm2_5_{f}_{s}_day"] = min( - last_row["pm2_5"], last_row[f"pm2_5_{f}_{s}_day"] - ) - attributes = ["year", "month", "day", "dayofweek"] - max_vals = [2023, 12, 31, 6, 23] - if frequency == "hourly": - attributes.extend(["hour", "minute"]) - max_vals.append([23, 59]) - for a, m in zip(attributes, max_vals): - new_row[a] = new_row["timestamp"].dt.__getattribute__(a) - new_row[a + "_sin"] = np.sin(2 * np.pi * new_row[a] / m) - new_row[a + "_cos"] = np.cos(2 * np.pi * new_row[a] / m) - new_row["week"] = new_row["timestamp"].dt.isocalendar().week - new_row["week_sin"] = np.sin(2 * np.pi * new_row["week"] / 52) - new_row["week_cos"] = np.cos(2 * np.pi * new_row["week"] / 52) - direct_forecast = forecast_model.predict( - new_row.drop(["timestamp", 
"pm2_5"]).values.reshape(1, -1) - )[0] - new_row["pm2_5_lower"] = lower_quantile_model.predict( - new_row.drop(["timestamp", "pm2_5"]).values.reshape(1, -1) - )[0] - new_row["pm2_5_upper"] = upper_quantile_model.predict( - new_row.drop(["timestamp", "pm2_5"]).values.reshape(1, -1) - )[0] - new_row["margin_of_error"] = ( - new_row["pm2_5_upper"] - new_row["pm2_5_lower"] - ) / 2 - new_row["pm2_5"] = direct_forecast + new_row["margin_of_error"] - - return new_row + """This method generates forecasts for a given device dataframe basing on horizon provided""" + df_tmp = df.copy() + for i in range(int(horizon)): + df_tmp = pd.concat([df_tmp, df.iloc[-1]], ignore_index=True) + similar_columns = ['site_id', 'device_id', 'device_category', 'latitude', 'longitude'] + for col in similar_columns: + df_tmp.iloc[-1, df_tmp.columns.get_loc(col)] = df_tmp.iloc[-2, df_tmp.columns.get_loc(col)] + + #daily frequency + if frequency == 'daily': + df_tmp.iloc[-1, df_tmp.columns.get_loc('timestamp')] = df.iloc[-2, df_tmp.columns.get_loc('timestamp')] + pd.Timedelta(days=1) + + #lag features + shifts1 = [1,2,3,7,14] + for s in shifts1: + df_tmp.iloc[-1, df_tmp.columns.get_loc(f"pm2_5_last_{s}_day")] = df_tmp['pm2_5'].shift(s) + + #rolling features + shifts2 = [2,3,7,14] + functions = ['mean', 'std', 'max', 'min'] + for s in shifts2: + for f in functions: + df_tmp.iloc[-1, df_tmp.columns.get_loc(f"pm2_5_{f}_{s}_day")] = df_tmp['pm2_5'].shift(1).rolling(s).agg(f) + + # hourly frequency + elif frequency == 'hourly': + df_tmp.iloc[-1, df_tmp.columns.get_loc('timestamp')] = df.iloc[-2, df_tmp.columns.get_loc('timestamp')] + pd.Timedelta(hours=1) + + #lag features + shifts1 = [1,2,6,12] + for s in shifts1: + df_tmp.iloc[-1, df_tmp.columns.get_loc(f"pm2_5_last_{s}_hour")] = df_tmp['pm2_5'].shift(s) + + #rolling features + shifts2 = [3,6,12,24] + functions = ['mean', 'std', 'median', 'skew'] + for s in shifts2: + for f in functions: + df_tmp.iloc[-1, df_tmp.columns.get_loc(f"pm2_5_{f}_{s}_hour")] = df_tmp['pm2_5'].shift(1).rolling(s).agg(f) + + #time and cyclic features + attributes = ['year', 'month', 'day', 'dayofweek'] + max_vals = [2023, 12, 30, 7] + if frequency == 'hourly': + attributes.append('hour') + max_vals.append(23) + for a, m in zip(attributes, max_vals): + df_tmp.iloc[-1, df_tmp.columns.get_loc(f"{a}_sin")] = np.sin(2 * np.pi * df_tmp[-1, df_tmp.columns.get_loc('timestamp')].dt.__getattribute__(a) / m) + df_tmp.iloc[-1, df_tmp.columns.get_loc(f"{a}_cos")] = np.cos(2 * np.pi * df_tmp[-1, df_tmp.columns.get_loc('timestamp')].dt.__getattribute__(a) / m) + df_tmp.iloc[-1, df_tmp.columns.get_loc('week_sin')] = np.sin(2 * np.pi * df_tmp[-1, df_tmp.columns.get_loc('timestamp')].dt.isocalendar().week / 52) + df_tmp.iloc[-1, df_tmp.columns.get_loc('week_cos')] = np.cos(2 * np.pi * df_tmp[-1, df_tmp.columns.get_loc('timestamp')].dt.isocalendar().week / 52) + + + #make predictions + df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5')] = forecast_model.predict(df_tmp.iloc[-1, df_tmp.columns != 'pm2_5' and df_tmp.columns != 'timestamp' and df_tmp.columns != 'margin_of_error' and df_tmp.columns != 'pm2_5_lower' and df_tmp.columns != 'pm2_5_upper'].values.reshape(1, -1)) + + df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5_lower')] = lower_quantile_model.predict(df_tmp.iloc[-1, df_tmp.columns != 'pm2_5' and df_tmp.columns != 'timestamp' and df_tmp.columns != 'margin_of_error' and df_tmp.columns != 'pm2_5_lower' and df_tmp.columns != 'pm2_5_upper'].values.reshape(1, -1)) + + df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5_upper')] 
= upper_quantile_model.predict(df_tmp.iloc[-1, df_tmp.columns != 'pm2_5' and df_tmp.columns != 'timestamp' and df_tmp.columns != 'margin_of_error' and df_tmp.columns != 'pm2_5_lower' and df_tmp.columns != 'pm2_5_upper'].values.reshape(1, -1)) + + df_tmp.iloc[-1, df_tmp.columns.get_loc('margin_of_error')] = (df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5_upper')] - df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5_lower')]) / 2 + + return df_tmp.iloc[-int(horizon):, :] forecasts = pd.DataFrame() forecast_model = get_trained_model_from_gcs( @@ -438,6 +431,8 @@ def get_new_row( upper_quantile_model = get_trained_model_from_gcs( project_name, bucket_name, f"{frequency}_upper_quantile_model.pkl" ) + + df_tmp = data.copy() for device in df_tmp["device_id"].unique(): test_copy = df_tmp[df_tmp["device_id"] == device] @@ -446,19 +441,17 @@ def get_new_row( if frequency == "hourly" else configuration.DAILY_FORECAST_HORIZON ) - for i in range(int(horizon)): - new_row = get_new_row( + device_forecasts = get_forecasts( test_copy, - device, forecast_model, lower_quantile_model, upper_quantile_model, frequency, + horizon, ) - test_copy = pd.concat( - [test_copy, new_row.to_frame().T], ignore_index=True - ) - forecasts = pd.concat([forecasts, test_copy], ignore_index=True) + + forecasts = pd.concat([forecasts, device_forecasts], ignore_index=True) + forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) forecasts["pm2_5_lower"] = forecasts["pm2_5_lower"].astype(float) @@ -477,6 +470,8 @@ def get_new_row( "site_id", ] ][forecasts["time"] >= current_time_utc] + + decode_categorical_features(result, frequency) return result @staticmethod From df858d5b771da4cc5714bc6b28cacbadf0a0dd33 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Wed, 23 Aug 2023 02:08:08 +0300 Subject: [PATCH 10/43] fix issues in new method for making forecasts --- src/airflow/airqo_etl_utils/ml_utils.py | 238 +++++++++++++++--------- src/airflow/dags/ml_prediction_jobs.py | 11 +- src/airflow/dags/ml_training_jobs.py | 9 +- 3 files changed, 164 insertions(+), 94 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index c13ba9d7c8..fcec50342e 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -59,6 +59,7 @@ def get_mapping_from_gcs(project_name, bucket_name, source_blob_name): mapping_dict = json.load(f) return mapping_dict + def decode_categorical_features(df, frequency): columns = ["device_id", "site_id", "device_category"] for col in columns: @@ -74,6 +75,7 @@ def decode_categorical_features(df, frequency): df[col] = df[col].map(mapping) return df + class ForecastUtils: @staticmethod def preprocess_data(data, frequency): @@ -176,8 +178,6 @@ def get_time_and_cyclic_features(df, freq): df1.drop(columns=attributes + ["week"], inplace=True) return df1 - - df_tmp = data.copy() df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) df_tmp = get_lag_features(df_tmp, target_column, frequency) @@ -186,10 +186,10 @@ def get_time_and_cyclic_features(df, freq): df_tmp = encode_categorical_features(df_tmp, frequency) elif job_type == "predict": df_tmp = decode_categorical_features(df_tmp, frequency) - #convert the categorical columns to int - df_tmp['device_id'] = df_tmp['device_id'].astype(int) - df_tmp['site_id'] = df_tmp['site_id'].astype(int) - df_tmp['device_category'] = df_tmp['device_category'].astype(int) + df_tmp.dropna(subset=["device_id", "site_id", "device_category"], inplace=True) + 
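Before saving, the job keeps only rows at or after the current UTC time; that comparison only works when both the cutoff and the forecast timestamps are timezone-aware. A tiny sketch of the filter on a synthetic frame:

import pandas as pd

forecasts = pd.DataFrame({
    "timestamp": pd.date_range("2023-08-23", periods=4, freq="H", tz="UTC"),
    "pm2_5": [20.0, 22.5, 25.0, 23.0],
})

# Both sides are tz-aware; comparing aware and naive timestamps raises a TypeError.
cutoff = pd.Timestamp.now(tz="UTC")
future_only = forecasts[forecasts["timestamp"] >= cutoff]
print(future_only)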
df_tmp["device_id"] = df_tmp["device_id"].astype(int) + df_tmp["site_id"] = df_tmp["site_id"].astype(int) + df_tmp["device_category"] = df_tmp["device_category"].astype(int) return df_tmp @@ -313,7 +313,10 @@ def objective(trial): alphas = [0.025, 0.975] models = [] - names = [f'{frequency}_lower_quantile_model', f'{frequency}_upper_quantile_model'] + names = [ + f"{frequency}_lower_quantile_model", + f"{frequency}_upper_quantile_model", + ] for alpha in alphas: clf = LGBMRegressor( @@ -337,15 +340,10 @@ def objective(trial): ) models.append(clf) for n, m in zip(names, models): - upload_trained_model_to_gcs( - m, project_id, bucket, f"{n}.pkl" - ) - + upload_trained_model_to_gcs(m, project_id, bucket, f"{n}.pkl") @staticmethod - def generate_forecasts( - data, project_name, bucket_name, frequency - ): + def generate_forecasts(data, project_name, bucket_name, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) data["pm2_5_lower"] = data["pm2_5_upper"] = data["margin_of_error"] = 0 @@ -355,84 +353,155 @@ def get_forecasts( lower_quantile_model, upper_quantile_model, frequency, - horizon + horizon, ): """This method generates forecasts for a given device dataframe basing on horizon provided""" df_tmp = df.copy() for i in range(int(horizon)): - df_tmp = pd.concat([df_tmp, df.iloc[-1]], ignore_index=True) - similar_columns = ['site_id', 'device_id', 'device_category', 'latitude', 'longitude'] - for col in similar_columns: - df_tmp.iloc[-1, df_tmp.columns.get_loc(col)] = df_tmp.iloc[-2, df_tmp.columns.get_loc(col)] - - #daily frequency - if frequency == 'daily': - df_tmp.iloc[-1, df_tmp.columns.get_loc('timestamp')] = df.iloc[-2, df_tmp.columns.get_loc('timestamp')] + pd.Timedelta(days=1) - - #lag features - shifts1 = [1,2,3,7,14] - for s in shifts1: - df_tmp.iloc[-1, df_tmp.columns.get_loc(f"pm2_5_last_{s}_day")] = df_tmp['pm2_5'].shift(s) - - #rolling features - shifts2 = [2,3,7,14] - functions = ['mean', 'std', 'max', 'min'] - for s in shifts2: - for f in functions: - df_tmp.iloc[-1, df_tmp.columns.get_loc(f"pm2_5_{f}_{s}_day")] = df_tmp['pm2_5'].shift(1).rolling(s).agg(f) - - # hourly frequency - elif frequency == 'hourly': - df_tmp.iloc[-1, df_tmp.columns.get_loc('timestamp')] = df.iloc[-2, df_tmp.columns.get_loc('timestamp')] + pd.Timedelta(hours=1) - - #lag features - shifts1 = [1,2,6,12] - for s in shifts1: - df_tmp.iloc[-1, df_tmp.columns.get_loc(f"pm2_5_last_{s}_hour")] = df_tmp['pm2_5'].shift(s) - - #rolling features - shifts2 = [3,6,12,24] - functions = ['mean', 'std', 'median', 'skew'] - for s in shifts2: - for f in functions: - df_tmp.iloc[-1, df_tmp.columns.get_loc(f"pm2_5_{f}_{s}_hour")] = df_tmp['pm2_5'].shift(1).rolling(s).agg(f) - - #time and cyclic features - attributes = ['year', 'month', 'day', 'dayofweek'] - max_vals = [2023, 12, 30, 7] - if frequency == 'hourly': - attributes.append('hour') - max_vals.append(23) - for a, m in zip(attributes, max_vals): - df_tmp.iloc[-1, df_tmp.columns.get_loc(f"{a}_sin")] = np.sin(2 * np.pi * df_tmp[-1, df_tmp.columns.get_loc('timestamp')].dt.__getattribute__(a) / m) - df_tmp.iloc[-1, df_tmp.columns.get_loc(f"{a}_cos")] = np.cos(2 * np.pi * df_tmp[-1, df_tmp.columns.get_loc('timestamp')].dt.__getattribute__(a) / m) - df_tmp.iloc[-1, df_tmp.columns.get_loc('week_sin')] = np.sin(2 * np.pi * df_tmp[-1, df_tmp.columns.get_loc('timestamp')].dt.isocalendar().week / 52) - df_tmp.iloc[-1, df_tmp.columns.get_loc('week_cos')] = np.cos(2 * np.pi * df_tmp[-1, df_tmp.columns.get_loc('timestamp')].dt.isocalendar().week / 52) - - - #make 
predictions - df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5')] = forecast_model.predict(df_tmp.iloc[-1, df_tmp.columns != 'pm2_5' and df_tmp.columns != 'timestamp' and df_tmp.columns != 'margin_of_error' and df_tmp.columns != 'pm2_5_lower' and df_tmp.columns != 'pm2_5_upper'].values.reshape(1, -1)) - - df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5_lower')] = lower_quantile_model.predict(df_tmp.iloc[-1, df_tmp.columns != 'pm2_5' and df_tmp.columns != 'timestamp' and df_tmp.columns != 'margin_of_error' and df_tmp.columns != 'pm2_5_lower' and df_tmp.columns != 'pm2_5_upper'].values.reshape(1, -1)) - - df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5_upper')] = upper_quantile_model.predict(df_tmp.iloc[-1, df_tmp.columns != 'pm2_5' and df_tmp.columns != 'timestamp' and df_tmp.columns != 'margin_of_error' and df_tmp.columns != 'pm2_5_lower' and df_tmp.columns != 'pm2_5_upper'].values.reshape(1, -1)) - - df_tmp.iloc[-1, df_tmp.columns.get_loc('margin_of_error')] = (df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5_upper')] - df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5_lower')]) / 2 - - return df_tmp.iloc[-int(horizon):, :] + df_tmp = pd.concat([df_tmp, df_tmp.iloc[-1:]],ignore_index=True) + similar_columns = [ + "site_id", + "device_id", + "device_category", + "latitude", + "longitude", + ] + for col in similar_columns: + df_tmp.iloc[-1, df_tmp.columns.get_loc(col)] = df_tmp.iloc[ + -2, df_tmp.columns.get_loc(col) + ] + + # daily frequency + if frequency == "daily": + df_tmp.iloc[-1, df_tmp.columns.get_loc("timestamp")] = df.iloc[ + -2, df_tmp.columns.get_loc("timestamp") + ] + pd.Timedelta(days=1) + + # lag features + shifts1 = [1, 2, 3, 7, 14] + for s in shifts1: + df_tmp.iloc[ + -1, df_tmp.columns.get_loc(f"pm2_5_last_{s}_day") + ] = df_tmp["pm2_5"].shift(s) + + # rolling features + shifts2 = [2, 3, 7, 14] + functions = ["mean", "std", "max", "min"] + for s in shifts2: + for f in functions: + df_tmp.iloc[ + -1, df_tmp.columns.get_loc(f"pm2_5_{f}_{s}_day") + ] = (df_tmp["pm2_5"].shift(1).rolling(s).agg(f)) + + # hourly frequency + elif frequency == "hourly": + df_tmp.iloc[-1, df_tmp.columns.get_loc("timestamp")] = df.iloc[ + -2, df_tmp.columns.get_loc("timestamp") + ] + pd.Timedelta(hours=1) + + # lag features + shifts1 = [1, 2, 6, 12] + for s in shifts1: + df_tmp.iloc[ + -1, df_tmp.columns.get_loc(f"pm2_5_last_{s}_hour") + ] = df_tmp["pm2_5"].shift(s) + + # rolling features + shifts2 = [3, 6, 12, 24] + functions = ["mean", "std", "median", "skew"] + for s in shifts2: + for f in functions: + df_tmp.iloc[ + -1, df_tmp.columns.get_loc(f"pm2_5_{f}_{s}_hour") + ] = (df_tmp["pm2_5"].shift(1).rolling(s).agg(f)) + + # time and cyclic features + attributes = ["year", "month", "day", "dayofweek"] + max_vals = [2023, 12, 30, 7] + if frequency == "hourly": + attributes.append("hour") + max_vals.append(23) + for a, m in zip(attributes, max_vals): + df_tmp.iloc[-1, df_tmp.columns.get_loc(f"{a}_sin")] = np.sin( + 2 + * np.pi + * df_tmp[ + -1, df_tmp.columns.get_loc("timestamp") + ].dt.__getattribute__(a) + / m + ) + df_tmp.iloc[-1, df_tmp.columns.get_loc(f"{a}_cos")] = np.cos( + 2 + * np.pi + * df_tmp[ + -1, df_tmp.columns.get_loc("timestamp") + ].dt.__getattribute__(a) + / m + ) + df_tmp.iloc[-1, df_tmp.columns.get_loc("week_sin")] = np.sin( + 2 + * np.pi + * df_tmp[-1, df_tmp.columns.get_loc("timestamp")] + .dt.isocalendar() + .week + / 52 + ) + df_tmp.iloc[-1, df_tmp.columns.get_loc("week_cos")] = np.cos( + 2 + * np.pi + * df_tmp[-1, df_tmp.columns.get_loc("timestamp")] + .dt.isocalendar() + .week + / 52 + ) + 
+ # make predictions + excluded_columns = ["pm2_5", "timestamp", "margin_of_error", "pm2_5_lower", "pm2_5_upper"] + df_tmp.iloc[ + -1, df_tmp.columns.get_loc("pm2_5") + ] = forecast_model.predict( + df_tmp.iloc[ + -1, + df_tmp.columns not in excluded_columns, + ].values.reshape(1, -1) + ) + + df_tmp.iloc[ + -1, df_tmp.columns.get_loc("pm2_5_lower") + ] = lower_quantile_model.predict( + df_tmp.iloc[ + -1, + df_tmp.columns not in excluded_columns, + ].values.reshape(1, -1) + ) + + df_tmp.iloc[ + -1, df_tmp.columns.get_loc("pm2_5_upper") + ] = upper_quantile_model.predict( + df_tmp.iloc[ + -1, + df_tmp.columns not in excluded_columns, + ].values.reshape(1, -1) + ) + + df_tmp.iloc[-1, df_tmp.columns.get_loc("margin_of_error")] = ( + df_tmp.iloc[-1, df_tmp.columns.get_loc("pm2_5_upper")] + - df_tmp.iloc[-1, df_tmp.columns.get_loc("pm2_5_lower")] + ) / 2 + + return df_tmp.iloc[-int(horizon) :, :] forecasts = pd.DataFrame() forecast_model = get_trained_model_from_gcs( project_name, bucket_name, f"{frequency}_forecast_model.pkl" ) lower_quantile_model = get_trained_model_from_gcs( - project_name, bucket_name, f"{frequency}_lower_quantile_model.pkl" + project_name, bucket_name, f"{frequency}_lower_quantile_model.pkl" ) upper_quantile_model = get_trained_model_from_gcs( project_name, bucket_name, f"{frequency}_upper_quantile_model.pkl" ) - df_tmp = data.copy() for device in df_tmp["device_id"].unique(): test_copy = df_tmp[df_tmp["device_id"] == device] @@ -442,17 +511,16 @@ def get_forecasts( else configuration.DAILY_FORECAST_HORIZON ) device_forecasts = get_forecasts( - test_copy, - forecast_model, - lower_quantile_model, - upper_quantile_model, - frequency, - horizon, - ) + test_copy, + forecast_model, + lower_quantile_model, + upper_quantile_model, + frequency, + horizon, + ) forecasts = pd.concat([forecasts, device_forecasts], ignore_index=True) - forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) forecasts["pm2_5_lower"] = forecasts["pm2_5_lower"].astype(float) forecasts["pm2_5_upper"] = forecasts["pm2_5_upper"].astype(float) diff --git a/src/airflow/dags/ml_prediction_jobs.py b/src/airflow/dags/ml_prediction_jobs.py index f90233afa4..04972fccd6 100644 --- a/src/airflow/dags/ml_prediction_jobs.py +++ b/src/airflow/dags/ml_prediction_jobs.py @@ -35,11 +35,13 @@ def preprocess_historical_data_hourly_forecast(data): @task def feature_eng_hourly_historical_data(data): - return ForecastUtils.feature_eng_data(data, 'pm2_5', 'hourly', 'predict') + return ForecastUtils.feature_eng_data(data, "pm2_5", "hourly", "predict") @task() def make_hourly_forecasts(data): - return ForecastUtils.generate_forecasts(data=data, project_name=project_id, bucket_name= bucket,frequency='hourly') + return ForecastUtils.generate_forecasts( + data=data, project_name=project_id, bucket_name=bucket, frequency="hourly" + ) @task() def save_hourly_forecasts_to_bigquery(data): @@ -69,12 +71,11 @@ def preprocess_historical_data_daily_forecast(data): @task() def feature_engineer_daily_historical_data(data): - return ForecastUtils.feature_eng_data(data, 'pm2_5', 'daily', 'predict') + return ForecastUtils.feature_eng_data(data, "pm2_5", "daily", "predict") @task() def make_daily_forecasts(data): - return ForecastUtils.generate_forecasts(data, project_id, bucket, 'daily') - + return ForecastUtils.generate_forecasts(data, project_id, bucket, "daily") @task() def save_daily_forecasts_to_bigquery(data): diff --git a/src/airflow/dags/ml_training_jobs.py b/src/airflow/dags/ml_training_jobs.py index 32287ee59c..836441c52a 100644 --- 
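Selecting "every column except these" from a single row is easiest to express with an Index mask; chaining boolean comparisons on a columns Index with the and keyword, or testing the Index with not in against a list, typically fails with an ambiguous-truth-value error rather than producing the intended mask. A small sketch of the usual idiom, reusing the column names assumed above:

import pandas as pd

row = pd.DataFrame(
    [[35.2, 30.1, 40.3, 0.5]],
    columns=["pm2_5", "pm2_5_lower", "pm2_5_upper", "hour_sin"],
)
excluded_columns = ["pm2_5", "pm2_5_lower", "pm2_5_upper"]

# Boolean mask over the columns: True where the column is not excluded,
# preserving the original feature order for the model.
feature_mask = ~row.columns.isin(excluded_columns)
features = row.iloc[-1, feature_mask].values.reshape(1, -1)
print(features)  # [[0.5]]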
a/src/airflow/dags/ml_training_jobs.py +++ b/src/airflow/dags/ml_training_jobs.py @@ -15,7 +15,6 @@ tags=["airqo", "hourly-forecast", "daily-forecast", "training-job"], ) def train_forecasting_models(): - # Hourly forecast tasks @task() def fetch_training_data_for_hourly_forecast_model(): @@ -28,6 +27,7 @@ def fetch_training_data_for_hourly_forecast_model(): ) start_date = date_to_str(start_date, str_format="%Y-%m-%d") return BigQueryApi().fetch_data(start_date) + @task() def preprocess_training_data_for_hourly_forecast_model(data): return ForecastUtils.preprocess_data(data, "hourly") @@ -38,10 +38,11 @@ def feature_engineer_training_data_for_hourly_forecast_model(data): @task() def train_and_save_hourly_forecast_model(train_data): - return ForecastUtils.train_and_save_forecast_models(train_data, frequency='hourly') - + return ForecastUtils.train_and_save_forecast_models( + train_data, frequency="hourly" + ) -# Daily forecast tasks + # Daily forecast tasks @task() def fetch_training_data_for_daily_forecast_model(): from dateutil.relativedelta import relativedelta From 169222f807e4fe627278eed81e76e08eae702760 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Wed, 23 Aug 2023 10:58:37 +0300 Subject: [PATCH 11/43] Update ml_utils.py --- src/airflow/airqo_etl_utils/ml_utils.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index fcec50342e..475506c650 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -380,19 +380,17 @@ def get_forecasts( # lag features shifts1 = [1, 2, 3, 7, 14] for s in shifts1: - df_tmp.iloc[ - -1, df_tmp.columns.get_loc(f"pm2_5_last_{s}_day") - ] = df_tmp["pm2_5"].shift(s) + df_tmp[f"pm2_5_last_{s}_day"] = df_tmp.shift(s, axis=0)["pm2_5"] + # rolling features shifts2 = [2, 3, 7, 14] functions = ["mean", "std", "max", "min"] for s in shifts2: for f in functions: - df_tmp.iloc[ - -1, df_tmp.columns.get_loc(f"pm2_5_{f}_{s}_day") - ] = (df_tmp["pm2_5"].shift(1).rolling(s).agg(f)) + df_tmp[f"pm2_5_{f}_{s}_day"] = (df_tmp.shift(1, axis=0).rolling(s).agg(f))["pm2_5"] + print('done') # hourly frequency elif frequency == "hourly": df_tmp.iloc[-1, df_tmp.columns.get_loc("timestamp")] = df.iloc[ @@ -402,20 +400,17 @@ def get_forecasts( # lag features shifts1 = [1, 2, 6, 12] for s in shifts1: - df_tmp.iloc[ - -1, df_tmp.columns.get_loc(f"pm2_5_last_{s}_hour") - ] = df_tmp["pm2_5"].shift(s) + df_tmp[f"pm2_5_last_{s}_hour"] = df_tmp.shift(s, axis=0)["pm2_5"] + # rolling features shifts2 = [3, 6, 12, 24] functions = ["mean", "std", "median", "skew"] for s in shifts2: for f in functions: - df_tmp.iloc[ - -1, df_tmp.columns.get_loc(f"pm2_5_{f}_{s}_hour") - ] = (df_tmp["pm2_5"].shift(1).rolling(s).agg(f)) + df_tmp[f"pm2_5_{f}_{s}_hour"] = (df_tmp.shift(1, axis=0).rolling(s).agg(f))["pm2_5"] + - # time and cyclic features attributes = ["year", "month", "day", "dayofweek"] max_vals = [2023, 12, 30, 7] if frequency == "hourly": From 167261c4698268f05b1d38e3b12654494bcf4e43 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Thu, 24 Aug 2023 01:07:34 +0300 Subject: [PATCH 12/43] fix forecast generation method --- src/airflow/airqo_etl_utils/ml_utils.py | 121 +++++++----------------- 1 file changed, 34 insertions(+), 87 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 
475506c650..e5a6c99445 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -1,6 +1,6 @@ import json import random -from datetime import datetime +from datetime import datetime, timedelta import gcsfs import joblib @@ -19,6 +19,7 @@ bucket = configuration.FORECAST_MODELS_BUCKET environment = configuration.ENVIRONMENT +pd.options.mode.chained_assignment = None def get_trained_model_from_gcs(project_name, bucket_name, source_blob_name): fs = gcsfs.GCSFileSystem(project=project_name) @@ -345,10 +346,11 @@ def objective(trial): @staticmethod def generate_forecasts(data, project_name, bucket_name, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) - data["pm2_5_lower"] = data["pm2_5_upper"] = data["margin_of_error"] = 0 + data.columns = data.columns.str.strip() + data["pm2_5_lower"] = data["pm2_5_upper"] = data["margin_of_error"] = data["adjusted_forecast"] = 0 def get_forecasts( - df, + df_tmp, forecast_model, lower_quantile_model, upper_quantile_model, @@ -356,44 +358,31 @@ def get_forecasts( horizon, ): """This method generates forecasts for a given device dataframe basing on horizon provided""" - df_tmp = df.copy() for i in range(int(horizon)): - df_tmp = pd.concat([df_tmp, df_tmp.iloc[-1:]],ignore_index=True) - similar_columns = [ - "site_id", - "device_id", - "device_category", - "latitude", - "longitude", - ] - for col in similar_columns: - df_tmp.iloc[-1, df_tmp.columns.get_loc(col)] = df_tmp.iloc[ - -2, df_tmp.columns.get_loc(col) - ] - + df_tmp = pd.concat([df_tmp, df_tmp.iloc[-1:]], ignore_index=True) + df_tmp_no_ts = df_tmp.drop("timestamp", axis=1, inplace=False) # daily frequency if frequency == "daily": - df_tmp.iloc[-1, df_tmp.columns.get_loc("timestamp")] = df.iloc[ - -2, df_tmp.columns.get_loc("timestamp") - ] + pd.Timedelta(days=1) - - # lag features + df_tmp.tail(1)['timestamp'] += timedelta(days=1) shifts1 = [1, 2, 3, 7, 14] for s in shifts1: df_tmp[f"pm2_5_last_{s}_day"] = df_tmp.shift(s, axis=0)["pm2_5"] - - # rolling features shifts2 = [2, 3, 7, 14] functions = ["mean", "std", "max", "min"] + #review this for s in shifts2: for f in functions: - df_tmp[f"pm2_5_{f}_{s}_day"] = (df_tmp.shift(1, axis=0).rolling(s).agg(f))["pm2_5"] + df_tmp[f"pm2_5_{f}_{s}_day"] = (df_tmp_no_ts.shift(1, axis=0).rolling(s).agg(f))[ + "pm2_5" + ] print('done') - # hourly frequency + + elif frequency == "hourly": - df_tmp.iloc[-1, df_tmp.columns.get_loc("timestamp")] = df.iloc[ + df_tmp.iloc[-1, df_tmp.columns.get_loc("timestamp")] = df_tmp.iloc[ + -2, df_tmp.columns.get_loc("timestamp") ] + pd.Timedelta(hours=1) @@ -417,72 +406,28 @@ def get_forecasts( attributes.append("hour") max_vals.append(23) for a, m in zip(attributes, max_vals): - df_tmp.iloc[-1, df_tmp.columns.get_loc(f"{a}_sin")] = np.sin( - 2 - * np.pi - * df_tmp[ - -1, df_tmp.columns.get_loc("timestamp") - ].dt.__getattribute__(a) - / m - ) - df_tmp.iloc[-1, df_tmp.columns.get_loc(f"{a}_cos")] = np.cos( - 2 - * np.pi - * df_tmp[ - -1, df_tmp.columns.get_loc("timestamp") - ].dt.__getattribute__(a) - / m - ) - df_tmp.iloc[-1, df_tmp.columns.get_loc("week_sin")] = np.sin( - 2 - * np.pi - * df_tmp[-1, df_tmp.columns.get_loc("timestamp")] - .dt.isocalendar() - .week - / 52 - ) - df_tmp.iloc[-1, df_tmp.columns.get_loc("week_cos")] = np.cos( - 2 - * np.pi - * df_tmp[-1, df_tmp.columns.get_loc("timestamp")] - .dt.isocalendar() - .week - / 52 - ) + df_tmp.tail(1)[f"{a}_sin"] = np.sin(2 * np.pi * df_tmp.tail(1)["timestamp"].dt.__getattribute__(a) / m) + df_tmp.tail(1)[f"{a}_cos"] 
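# Illustrative sketch, not part of the patch: assignment through tail(1) goes via an
# intermediate object and is not guaranteed to reach the original frame (pandas flags
# it with SettingWithCopyWarning), while .loc on the last index label writes in place.
import pandas as pd

df = pd.DataFrame({"pm2_5": [10.0, 12.0, 14.0]})
df.tail(1)["pm2_5"] = 99.0            # chained assignment, may be silently lost
df.loc[df.index[-1], "pm2_5"] = 99.0  # updates df itself
print(df)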
= np.cos(2 * np.pi * df_tmp.tail(1)["timestamp"].dt.__getattribute__(a) / m) + df_tmp.tail(1)["week_sin"] = np.sin(2 * np.pi * df_tmp.tail(1)["timestamp"].dt.isocalendar().week / 52) + df_tmp.tail(1)["week_cos"] = np.cos(2 * np.pi * df_tmp.tail(1)["timestamp"].dt.isocalendar().week / 52) # make predictions - excluded_columns = ["pm2_5", "timestamp", "margin_of_error", "pm2_5_lower", "pm2_5_upper"] - df_tmp.iloc[ - -1, df_tmp.columns.get_loc("pm2_5") - ] = forecast_model.predict( - df_tmp.iloc[ - -1, - df_tmp.columns not in excluded_columns, - ].values.reshape(1, -1) + excluded_columns = ["pm2_5", "timestamp", "margin_of_error", "pm2_5_lower", "pm2_5_upper", "adjusted_forecast"] + print(df_tmp.tail(1)) + # df_tmp.tail(1)['pm2_5'] = forecast_model.predict(df_tmp.tail(1).drop(excluded_columns).values.reshape(1, -1)) + df_tmp.loc[df_tmp.index[-1], "pm2_5"] = forecast_model.predict( + df_tmp.drop(excluded_columns, axis=1).tail(1).values.reshape(1, -1) ) - - df_tmp.iloc[ - -1, df_tmp.columns.get_loc("pm2_5_lower") - ] = lower_quantile_model.predict( - df_tmp.iloc[ - -1, - df_tmp.columns not in excluded_columns, - ].values.reshape(1, -1) + df_tmp.loc[df_tmp.index[-1], "pm2_5_lower"] = lower_quantile_model.predict( + df_tmp.drop(excluded_columns, axis=1).tail(1).values.reshape(1, -1) ) - - df_tmp.iloc[ - -1, df_tmp.columns.get_loc("pm2_5_upper") - ] = upper_quantile_model.predict( - df_tmp.iloc[ - -1, - df_tmp.columns not in excluded_columns, - ].values.reshape(1, -1) + df_tmp.loc[df_tmp.index[-1], "pm2_5_upper"] = upper_quantile_model.predict( + df_tmp.drop(excluded_columns, axis=1).tail(1).values.reshape(1, -1) ) + df_tmp.loc[df_tmp.index[-1], "margin_of_error"] = (df_tmp.loc[df_tmp.index[-1], "pm2_5_upper"] - df_tmp.loc[df_tmp.index[-1], "pm2_5_lower"])/2 + df_tmp.loc[df_tmp.index[-1], "adjusted_forecast"] = df_tmp.loc[df_tmp.index[-1], "pm2_5"] + df_tmp.loc[df_tmp.index[-1], "margin_of_error"] + - df_tmp.iloc[-1, df_tmp.columns.get_loc("margin_of_error")] = ( - df_tmp.iloc[-1, df_tmp.columns.get_loc("pm2_5_upper")] - - df_tmp.iloc[-1, df_tmp.columns.get_loc("pm2_5_lower")] - ) / 2 return df_tmp.iloc[-int(horizon) :, :] @@ -515,6 +460,8 @@ def get_forecasts( ) forecasts = pd.concat([forecasts, device_forecasts], ignore_index=True) + print(device) + forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) forecasts["pm2_5_lower"] = forecasts["pm2_5_lower"].astype(float) From 31e9ec142cb0a03dc38e5ff4a7ca2fd83da84c5f Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Thu, 24 Aug 2023 02:25:10 +0300 Subject: [PATCH 13/43] Update ml_utils.py --- src/airflow/airqo_etl_utils/ml_utils.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index e5a6c99445..b4e6b59dab 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -370,7 +370,6 @@ def get_forecasts( # rolling features shifts2 = [2, 3, 7, 14] functions = ["mean", "std", "max", "min"] - #review this for s in shifts2: for f in functions: df_tmp[f"pm2_5_{f}_{s}_day"] = (df_tmp_no_ts.shift(1, axis=0).rolling(s).agg(f))[ @@ -467,22 +466,8 @@ def get_forecasts( forecasts["pm2_5_lower"] = forecasts["pm2_5_lower"].astype(float) forecasts["pm2_5_upper"] = forecasts["pm2_5_upper"].astype(float) forecasts["margin_of_error"] = forecasts["margin_of_error"].astype(float) - current_time_utc = pd.Timestamp(datetime.utcnow(), tz="UTC") - 
forecasts.rename(columns={"timestamp": "time"}, inplace=True) - result = forecasts[ - [ - "timestamp", - "pm2_5", - "pm2_5_lower", - "pm2_5_upper", - "margin_of_error", - "device_id", - "site_id", - ] - ][forecasts["time"] >= current_time_utc] - - decode_categorical_features(result, frequency) - return result + decode_categorical_features(forecasts, frequency) + return forecasts @staticmethod def save_forecasts_to_mongo(data, frequency): From 4dfa9acf8faa0b883690c0d993e672b966ec4366 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Fri, 25 Aug 2023 23:18:34 +0300 Subject: [PATCH 14/43] refactor endpoints --- src/predict/api/dump.rdb | Bin 89 -> 0 bytes src/predict/api/helpers.py | 14 +++++++++----- src/predict/api/prediction.py | 13 +++++++++++++ 3 files changed, 22 insertions(+), 5 deletions(-) delete mode 100644 src/predict/api/dump.rdb diff --git a/src/predict/api/dump.rdb b/src/predict/api/dump.rdb deleted file mode 100644 index 3890d66e38ccf5adc5d018cc4ea4f93fb47d027f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 89 zcmWG?b@2=~FfcUu#aWb^l3A=2Jaq0nv{|u`R$>lYFJ`4clZ6wnG diff --git a/src/predict/api/helpers.py b/src/predict/api/helpers.py index b9edb0682b..baa1777d93 100644 --- a/src/predict/api/helpers.py +++ b/src/predict/api/helpers.py @@ -85,8 +85,8 @@ def geo_coordinates_cache_key(): def get_health_tips() -> list[dict]: try: response = requests.get( - f"{Config.AIRQO_BASE_URL}/api/v2/devices/tips?token={Config.AIRQO_API_AUTH_TOKEN}", - timeout=3, + f"{Config.AIRQO_BASE_URL}api/v2/devices/tips?token={Config.AIRQO_API_AUTH_TOKEN}", + timeout=10, ) result = response.json() return result["tips"] @@ -186,6 +186,7 @@ def get_predictions_by_geo_coordinates_v2(latitude: float, longitude: float) -> @cache.memoize(timeout=Config.CACHE_TIMEOUT) def get_forecasts( db_name, + device_id=None, site_id=None, site_name=None, parish=None, @@ -196,6 +197,7 @@ def get_forecasts( ): query = {} params = { + "device_id": device_id, "site_id": site_id, "site_name": site_name, "parish": parish, @@ -213,14 +215,16 @@ def get_forecasts( results = [] if site_forecasts: - for time, pm2_5, health_tips in zip( - site_forecasts[0]["time"], + for time, pm2_5, margin_of_error, adjusted_forecast in zip( + site_forecasts[0]["timestamp"], site_forecasts[0]["pm2_5"], + site_forecasts[0]["margin_of_error"], + site_forecasts[0]["adjusted_forecast"], ): result = { key: value for key, value in zip( - ["time", "pm2_5"], [time, pm2_5] + ["time", "pm2_5", "margin_of_error", "adjusted_forecast"], [time, pm2_5, margin_of_error, adjusted_forecast] ) } results.append(result) diff --git a/src/predict/api/prediction.py b/src/predict/api/prediction.py index 97335f31a6..068ae4ce48 100644 --- a/src/predict/api/prediction.py +++ b/src/predict/api/prediction.py @@ -124,6 +124,7 @@ def get_next_1_week_forecasts(): params = { name: request.args.get(name, default=None, type=str) for name in [ + "device_id", "site_id", "site_name", "parish", @@ -145,6 +146,18 @@ def get_next_1_week_forecasts(): ) result = get_forecasts(**params, db_name="daily_forecasts") if result: + health_tips = get_health_tips() + for forecast in result["forecasts"]: + pm2_5 = forecast["pm2_5"] + forecast["health_tips"] = list( + filter( + lambda x: x["aqi_category"]["max"] + >= pm2_5 + >= x["aqi_category"]["min"], + health_tips, + ) + ) + response = result else: response = { From b31a0fb61edb1e40805ffd01865b378335663d76 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble 
<60974514+Mnoble-19@users.noreply.github.com> Date: Sat, 26 Aug 2023 01:28:51 +0300 Subject: [PATCH 15/43] code cleanup --- src/airflow/airqo_etl_utils/ml_utils.py | 437 +++++++++++++++--------- src/airflow/dags/ml_training_jobs.py | 13 +- src/airflow/requirements.txt | 2 +- src/predict/api/helpers.py | 9 +- src/predict/api/prediction.py | 8 +- 5 files changed, 290 insertions(+), 179 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index b4e6b59dab..591f72e133 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -21,70 +21,115 @@ pd.options.mode.chained_assignment = None -def get_trained_model_from_gcs(project_name, bucket_name, source_blob_name): - fs = gcsfs.GCSFileSystem(project=project_name) - fs.ls(bucket_name) - with fs.open(bucket_name + "/" + source_blob_name, "rb") as handle: - job = joblib.load(handle) - return job - - -def upload_trained_model_to_gcs( - trained_model, project_name, bucket_name, source_blob_name -): - fs = gcsfs.GCSFileSystem(project=project_name) - try: - fs.rename( - f"{bucket_name}/{source_blob_name}", - f"{bucket_name}/{datetime.now()}-{source_blob_name}", - ) - print("Bucket: previous model is backed up") - except: - print("Bucket: No file to updated") - # store new model - with fs.open(bucket_name + "/" + source_blob_name, "wb") as handle: - job = joblib.dump(trained_model, handle) +class GCSUtils: + # TODO: In future, save and retrieve models from mlflow instead of GCS + @staticmethod + def get_trained_model_from_gcs(project_name, bucket_name, source_blob_name): + fs = gcsfs.GCSFileSystem(project=project_name) + fs.ls(bucket_name) + with fs.open(bucket_name + "/" + source_blob_name, "rb") as handle: + job = joblib.load(handle) + return job + @staticmethod + def upload_trained_model_to_gcs( + trained_model, project_name, bucket_name, source_blob_name + ): + fs = gcsfs.GCSFileSystem(project=project_name) + try: + fs.rename( + f"{bucket_name}/{source_blob_name}", + f"{bucket_name}/{datetime.now()}-{source_blob_name}", + ) + print("Bucket: previous model is backed up") + except: + print("Bucket: No file to updated") -def upload_mapping_to_gcs(mapping_dict, project_name, bucket_name, source_blob_name): - fs = gcsfs.GCSFileSystem(project=project_name) - mapping_dict = json.dumps(mapping_dict) - with fs.open(bucket_name + "/" + source_blob_name, "w") as f: - f.write(mapping_dict) + with fs.open(bucket_name + "/" + source_blob_name, "wb") as handle: + job = joblib.dump(trained_model, handle) + @staticmethod + def upload_mapping_to_gcs( + mapping_dict, project_name, bucket_name, source_blob_name + ): + fs = gcsfs.GCSFileSystem(project=project_name) + mapping_dict = json.dumps(mapping_dict) + with fs.open(bucket_name + "/" + source_blob_name, "w") as f: + f.write(mapping_dict) -def get_mapping_from_gcs(project_name, bucket_name, source_blob_name): - fs = gcsfs.GCSFileSystem(project=project_name) - with fs.open(bucket_name + "/" + source_blob_name, "r") as f: - mapping_dict = json.load(f) - return mapping_dict + @staticmethod + def get_mapping_from_gcs(project_name, bucket_name, source_blob_name): + fs = gcsfs.GCSFileSystem(project=project_name) + with fs.open(bucket_name + "/" + source_blob_name, "r") as f: + mapping_dict = json.load(f) + return mapping_dict -def decode_categorical_features(df, frequency): - columns = ["device_id", "site_id", "device_category"] - for col in columns: - if frequency == "hourly": - mapping = get_mapping_from_gcs( - project_id, 
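# Illustrative sketch, not part of the patch: the encode/decode round trip for
# categorical ids, assuming a {original_value: integer_code} mapping like the JSON
# dictionaries stored alongside the models; values below are made up.
import pandas as pd

df = pd.DataFrame({"device_id": ["aq_01", "aq_02", "aq_01"]})
mapping = {"aq_01": 101, "aq_02": 202}
df["device_id"] = df["device_id"].map(mapping)                # encode for the model
inverse = {code: name for name, code in mapping.items()}
df["device_id"] = df["device_id"].map(inverse)                # decode before saving
print(df)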
bucket, f"hourly_{col}_mapping.json" - ) - elif frequency == "daily": - mapping = get_mapping_from_gcs( - project_id, bucket, f"daily_{col}_mapping.json" - ) +class DecodingUtils: + @staticmethod + def decode_categorical_features_pred(df, frequency): + columns = ["device_id", "site_id", "device_category"] + mapping = {} + for col in columns: + if frequency == "hourly": + mapping = GCSUtils.get_mapping_from_gcs( + project_id, bucket, f"hourly_{col}_mapping.json" + ) + elif frequency == "daily": + mapping = GCSUtils.get_mapping_from_gcs( + project_id, bucket, f"daily_{col}_mapping.json" + ) + df[col] = df[col].map(mapping) + return df - df[col] = df[col].map(mapping) - return df + @staticmethod + def decode_categorical_features_before_save(df, frequency): + columns = ["device_id", "site_id", "device_category"] + mapping = {} + for col in columns: + if frequency == "hourly": + mapping = GCSUtils.get_mapping_from_gcs( + project_id, bucket, f"hourly_{col}_mapping.json" + ) + elif frequency == "daily": + mapping = GCSUtils.get_mapping_from_gcs( + project_id, bucket, f"daily_{col}_mapping.json" + ) + df[col] = df[col].map({v: k for k, v in mapping.items()}) + return df + + def encode_categorical_training_features(df, freq): + df1 = df.copy() + columns = ["device_id", "site_id", "device_category"] + mappings = [] + for col in columns: + mapping = {} + for val in df1[col].unique(): + num = random.randint(0, 1000) + while num in mapping.values(): + num = random.randint(0, 1000) + mapping[val] = num + df1[col] = df1[col].map(mapping) + mappings.append(mapping) + for i, col in enumerate(columns): + GCSUtils.upload_mapping_to_gcs( + mappings[i], + project_id, + bucket, + f"{freq}_{col}_mapping.json", + ) + return df1 class ForecastUtils: @staticmethod - def preprocess_data(data, frequency): + def preprocess_data(data, data_frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])[ "pm2_5" ].transform(lambda x: x.interpolate(method="linear", limit_direction="both")) - if frequency == "daily": + if data_frequency == "daily": data = ( data.groupby(["device_id", "site_id", "device_category"]) .resample("D", on="timestamp") @@ -100,9 +145,9 @@ def preprocess_data(data, frequency): return data @staticmethod - def feature_eng_data(data, target_column, frequency, job_type): + def feature_eng_data(data, target_column, data_frequency, job_type): def get_lag_features(df, target_col, freq): - df1 = df.copy() + df1 = df.copy() # use copy to prevent terminal warning if freq == "daily": shifts = [1, 2, 3, 7, 14] for s in shifts: @@ -142,25 +187,6 @@ def get_lag_features(df, target_col, freq): return df1 - def encode_categorical_features(df, frequency): - df1 = df.copy() - columns = ["device_id", "site_id", "device_category"] - mappings = [] - for col in columns: - mapping = {} - for val in df1[col].unique(): - num = random.randint(0, 10000) - while num in mapping.values(): - num = random.randint(0, 10000) - mapping[val] = num - df1[col] = df1[col].map(mapping) - mappings.append(mapping) - for i, col in enumerate(columns): - upload_mapping_to_gcs( - mappings[i], project_id, bucket, f"{frequency}_{col}_mapping.json" - ) - return df1 - def get_time_and_cyclic_features(df, freq): df1 = df.copy() attributes = ["year", "month", "day", "dayofweek"] @@ -181,13 +207,20 @@ def get_time_and_cyclic_features(df, freq): df_tmp = data.copy() df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) - df_tmp = get_lag_features(df_tmp, target_column, 
frequency) - df_tmp = get_time_and_cyclic_features(df_tmp, frequency) + df_tmp = get_lag_features(df_tmp, target_column, data_frequency) + df_tmp = get_time_and_cyclic_features(df_tmp, data_frequency) if job_type == "train": - df_tmp = encode_categorical_features(df_tmp, frequency) + df_tmp = DecodingUtils.encode_categorical_training_features( + df_tmp, data_frequency + ) elif job_type == "predict": - df_tmp = decode_categorical_features(df_tmp, frequency) - df_tmp.dropna(subset=["device_id", "site_id", "device_category"], inplace=True) + df_tmp = DecodingUtils.decode_categorical_features_pred( + df_tmp, data_frequency + ) + df_tmp.dropna( + subset=["device_id", "site_id", "device_category"], inplace=True + ) # only 1 row, not sure why + df_tmp["device_id"] = df_tmp["device_id"].astype(int) df_tmp["site_id"] = df_tmp["site_id"].astype(int) df_tmp["device_category"] = df_tmp["device_category"].astype(int) @@ -195,27 +228,27 @@ def get_time_and_cyclic_features(df, freq): return df_tmp @staticmethod - def train_and_save_forecast_models(train, frequency): + def train_and_save_forecast_models(training_data, frequency): """ Perform the actual training for hourly data """ - train["timestamp"] = pd.to_datetime(train["timestamp"]) - features = [c for c in train.columns if c not in ["timestamp", "pm2_5"]] + training_data["timestamp"] = pd.to_datetime(training_data["timestamp"]) + features = [c for c in training_data.columns if c not in ["timestamp", "pm2_5"]] print(features) target_col = "pm2_5" train_data = validation_data = test_data = pd.DataFrame() - for device in train["device_id"].unique(): - device_df = train[train["device_id"] == device] + for device in training_data["device_id"].unique(): + device_df = training_data[training_data["device_id"] == device] months = device_df["timestamp"].dt.month.unique() train_months = val_months = test_months = [] if frequency == "hourly": - train_months = months[:4] - val_months = months[4:5] - test_months = months[5:] + train_months = months[:2] + val_months = months[2:3] + test_months = months[3:] elif frequency == "daily": - train_months = months[:8] - val_months = months[8:9] - test_months = months[9:] + train_months = months[:6] + val_months = months[6:7] + test_months = months[7:] train_df = device_df[device_df["timestamp"].dt.month.isin(train_months)] val_df = device_df[device_df["timestamp"].dt.month.isin(val_months)] @@ -308,52 +341,103 @@ def objective(trial): callbacks=[early_stopping(stopping_rounds=150)], ) - upload_trained_model_to_gcs( + GCSUtils.upload_trained_model_to_gcs( clf, project_id, bucket, f"{frequency}_forecast_model.pkl" ) - alphas = [0.025, 0.975] - models = [] - names = [ - f"{frequency}_lower_quantile_model", - f"{frequency}_upper_quantile_model", - ] - - for alpha in alphas: - clf = LGBMRegressor( - n_estimators=best_params["n_estimators"], - learning_rate=best_params["learning_rate"], - colsample_bytree=best_params["colsample_bytree"], - reg_alpha=best_params["reg_alpha"], - reg_lambda=best_params["reg_lambda"], - max_depth=best_params["max_depth"], - random_state=42, - verbosity=2, - objective="quantile", - alpha=alpha, - metric="quantile", + def create_error_df(data, target, preds): + error_df = pd.DataFrame( + { + "actual_values": target, + "predicted_values": preds, + } ) - clf.fit( - train_data[features], - train_target, - eval_set=[(test_data[features], test_target)], - categorical_feature=["device_id", "site_id", "device_category"], + error_df["errors"] = ( + error_df["predicted_values"] - error_df["actual_values"] ) - 
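# Illustrative sketch, not part of the patch: a lower/upper LightGBM quantile pair
# giving an approximate 95% interval and a margin of error; the data and parameters
# here are synthetic.
import numpy as np
from lightgbm import LGBMRegressor

rng = np.random.default_rng(42)
X = rng.normal(size=(500, 3))
y = 5 * X[:, 0] + rng.normal(scale=2, size=500)

lower = LGBMRegressor(objective="quantile", alpha=0.025, n_estimators=50).fit(X, y)
upper = LGBMRegressor(objective="quantile", alpha=0.975, n_estimators=50).fit(X, y)
margin_of_error = (upper.predict(X[:5]) - lower.predict(X[:5])) / 2
print(margin_of_error)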
models.append(clf) - for n, m in zip(names, models): - upload_trained_model_to_gcs(m, project_id, bucket, f"{n}.pkl") + error_df = pd.concat([error_df, data], axis=1) + error_df.drop(["actual_values", "pm2_5"], axis=1, inplace=True) + error_df.rename(columns={"predicted_values": "pm2_5"}, inplace=True) + + return error_df + + error_df1 = create_error_df( + train_data, train_target, clf.predict(train_data[features]) + ) + error_df2 = create_error_df( + test_data, test_target, clf.predict(test_data[features]) + ) + + error_features1 = [c for c in error_df1.columns if c not in ["errors"]] + error_features2 = [c for c in error_df2.columns if c not in ["errors"]] + + error_target1 = error_df1["errors"] + error_target2 = error_df2["errors"] + + error_clf = LGBMRegressor( + n_estimators=31, + colsample_bytree=1, + learning_rate=0.1, + metric="rmse", + max_depth=5, + random_state=42, + verbosity=2, + ) + + error_clf.fit( + error_df1[error_features1], + error_target1, + eval_set=[(error_df2[error_features2], error_target2)], + categorical_feature=["device_id", "site_id", "device_category"], + callbacks=[early_stopping(stopping_rounds=150)], + ) + + GCSUtils.upload_trained_model_to_gcs( + error_clf, project_id, bucket, f"{frequency}_error_model.pkl" + ) + + # TODO: quantile regression approach + # alphas = [0.025, 0.975] + # models = [] + # names = [ + # f"{frequency}_lower_quantile_model", + # f"{frequency}_upper_quantile_model", + # ] + # + # for alpha in alphas: + # clf = LGBMRegressor( + # n_estimators=best_params["n_estimators"], + # learning_rate=best_params["learning_rate"], + # colsample_bytree=best_params["colsample_bytree"], + # reg_alpha=best_params["reg_alpha"], + # reg_lambda=best_params["reg_lambda"], + # max_depth=best_params["max_depth"], + # random_state=42, + # verbosity=2, + # objective="quantile", + # alpha=alpha, + # metric="quantile", + # ) + # clf.fit( + # train_data[features], + # train_target, + # eval_set=[(test_data[features], test_target)], + # categorical_feature=["device_id", "site_id", "device_category"], + # ) + # models.append(clf) + # for n, m in zip(names, models): + # upload_trained_model_to_gcs(m, project_id, bucket, f"{n}.pkl") @staticmethod def generate_forecasts(data, project_name, bucket_name, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) data.columns = data.columns.str.strip() - data["pm2_5_lower"] = data["pm2_5_upper"] = data["margin_of_error"] = data["adjusted_forecast"] = 0 + data["margin_of_error"] = data["adjusted_forecast"] = 0 def get_forecasts( df_tmp, forecast_model, - lower_quantile_model, - upper_quantile_model, + error_model, frequency, horizon, ): @@ -363,7 +447,7 @@ def get_forecasts( df_tmp_no_ts = df_tmp.drop("timestamp", axis=1, inplace=False) # daily frequency if frequency == "daily": - df_tmp.tail(1)['timestamp'] += timedelta(days=1) + df_tmp.tail(1)["timestamp"] += timedelta(days=1) shifts1 = [1, 2, 3, 7, 14] for s in shifts1: df_tmp[f"pm2_5_last_{s}_day"] = df_tmp.shift(s, axis=0)["pm2_5"] @@ -372,32 +456,32 @@ def get_forecasts( functions = ["mean", "std", "max", "min"] for s in shifts2: for f in functions: - df_tmp[f"pm2_5_{f}_{s}_day"] = (df_tmp_no_ts.shift(1, axis=0).rolling(s).agg(f))[ - "pm2_5" - ] - - print('done') + df_tmp[f"pm2_5_{f}_{s}_day"] = ( + df_tmp_no_ts.shift(1, axis=0).rolling(s).agg(f) + )["pm2_5"] + print("done") elif frequency == "hourly": df_tmp.iloc[-1, df_tmp.columns.get_loc("timestamp")] = df_tmp.iloc[ - -2, df_tmp.columns.get_loc("timestamp") ] + pd.Timedelta(hours=1) # lag features shifts1 
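# Illustrative sketch, not part of the patch: the residual-model idea in miniature,
# where a second regressor learns the point model's errors and its prediction is
# added back as an adjustment; the data here is synthetic.
import numpy as np
from lightgbm import LGBMRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))
y = 5 * X[:, 0] + rng.normal(scale=2, size=500)

point_model = LGBMRegressor(n_estimators=50).fit(X, y)
residuals = point_model.predict(X) - y
error_model = LGBMRegressor(n_estimators=50).fit(X, residuals)
adjusted_forecast = point_model.predict(X[:5]) + error_model.predict(X[:5])
print(adjusted_forecast)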
= [1, 2, 6, 12] for s in shifts1: - df_tmp[f"pm2_5_last_{s}_hour"] = df_tmp.shift(s, axis=0)["pm2_5"] - + df_tmp[f"pm2_5_last_{s}_hour"] = df_tmp.shift(s, axis=0)[ + "pm2_5" + ] # rolling features shifts2 = [3, 6, 12, 24] functions = ["mean", "std", "median", "skew"] for s in shifts2: for f in functions: - df_tmp[f"pm2_5_{f}_{s}_hour"] = (df_tmp.shift(1, axis=0).rolling(s).agg(f))["pm2_5"] - + df_tmp[f"pm2_5_{f}_{s}_hour"] = ( + df_tmp.shift(1, axis=0).rolling(s).agg(f) + )["pm2_5"] attributes = ["year", "month", "day", "dayofweek"] max_vals = [2023, 12, 30, 7] @@ -405,40 +489,59 @@ def get_forecasts( attributes.append("hour") max_vals.append(23) for a, m in zip(attributes, max_vals): - df_tmp.tail(1)[f"{a}_sin"] = np.sin(2 * np.pi * df_tmp.tail(1)["timestamp"].dt.__getattribute__(a) / m) - df_tmp.tail(1)[f"{a}_cos"] = np.cos(2 * np.pi * df_tmp.tail(1)["timestamp"].dt.__getattribute__(a) / m) - df_tmp.tail(1)["week_sin"] = np.sin(2 * np.pi * df_tmp.tail(1)["timestamp"].dt.isocalendar().week / 52) - df_tmp.tail(1)["week_cos"] = np.cos(2 * np.pi * df_tmp.tail(1)["timestamp"].dt.isocalendar().week / 52) + df_tmp.tail(1)[f"{a}_sin"] = np.sin( + 2 + * np.pi + * df_tmp.tail(1)["timestamp"].dt.__getattribute__(a) + / m + ) + df_tmp.tail(1)[f"{a}_cos"] = np.cos( + 2 + * np.pi + * df_tmp.tail(1)["timestamp"].dt.__getattribute__(a) + / m + ) + df_tmp.tail(1)["week_sin"] = np.sin( + 2 * np.pi * df_tmp.tail(1)["timestamp"].dt.isocalendar().week / 52 + ) + df_tmp.tail(1)["week_cos"] = np.cos( + 2 * np.pi * df_tmp.tail(1)["timestamp"].dt.isocalendar().week / 52 + ) # make predictions - excluded_columns = ["pm2_5", "timestamp", "margin_of_error", "pm2_5_lower", "pm2_5_upper", "adjusted_forecast"] + excluded_columns = [ + "pm2_5", + "timestamp", + "margin_of_error", + "adjusted_forecast", + ] + excluded_columns_2 = [ + "timestamp", + "margin_of_error", + "adjusted_forecast", + ] print(df_tmp.tail(1)) - # df_tmp.tail(1)['pm2_5'] = forecast_model.predict(df_tmp.tail(1).drop(excluded_columns).values.reshape(1, -1)) df_tmp.loc[df_tmp.index[-1], "pm2_5"] = forecast_model.predict( df_tmp.drop(excluded_columns, axis=1).tail(1).values.reshape(1, -1) ) - df_tmp.loc[df_tmp.index[-1], "pm2_5_lower"] = lower_quantile_model.predict( - df_tmp.drop(excluded_columns, axis=1).tail(1).values.reshape(1, -1) + df_tmp.loc[df_tmp.index[-1], "margin_of_error"] = error_model.predict( + df_tmp.drop(excluded_columns_2, axis=1) + .tail(1) + .values.reshape(1, -1) ) - df_tmp.loc[df_tmp.index[-1], "pm2_5_upper"] = upper_quantile_model.predict( - df_tmp.drop(excluded_columns, axis=1).tail(1).values.reshape(1, -1) + df_tmp.loc[df_tmp.index[-1], "adjusted_forecast"] = ( + df_tmp.loc[df_tmp.index[-1], "pm2_5"] + + df_tmp.loc[df_tmp.index[-1], "margin_of_error"] ) - df_tmp.loc[df_tmp.index[-1], "margin_of_error"] = (df_tmp.loc[df_tmp.index[-1], "pm2_5_upper"] - df_tmp.loc[df_tmp.index[-1], "pm2_5_lower"])/2 - df_tmp.loc[df_tmp.index[-1], "adjusted_forecast"] = df_tmp.loc[df_tmp.index[-1], "pm2_5"] + df_tmp.loc[df_tmp.index[-1], "margin_of_error"] - - return df_tmp.iloc[-int(horizon) :, :] forecasts = pd.DataFrame() - forecast_model = get_trained_model_from_gcs( + forecast_model = GCSUtils.get_trained_model_from_gcs( project_name, bucket_name, f"{frequency}_forecast_model.pkl" ) - lower_quantile_model = get_trained_model_from_gcs( - project_name, bucket_name, f"{frequency}_lower_quantile_model.pkl" - ) - upper_quantile_model = get_trained_model_from_gcs( - project_name, bucket_name, f"{frequency}_upper_quantile_model.pkl" + error_model = 
GCSUtils.get_trained_model_from_gcs( + project_name, bucket_name, f"{frequency}_error_model.pkl" ) df_tmp = data.copy() @@ -452,8 +555,7 @@ def get_forecasts( device_forecasts = get_forecasts( test_copy, forecast_model, - lower_quantile_model, - upper_quantile_model, + error_model, frequency, horizon, ) @@ -461,33 +563,46 @@ def get_forecasts( forecasts = pd.concat([forecasts, device_forecasts], ignore_index=True) print(device) - forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) - forecasts["pm2_5_lower"] = forecasts["pm2_5_lower"].astype(float) - forecasts["pm2_5_upper"] = forecasts["pm2_5_upper"].astype(float) forecasts["margin_of_error"] = forecasts["margin_of_error"].astype(float) - decode_categorical_features(forecasts, frequency) + + DecodingUtils.decode_categorical_features_before_save(forecasts, frequency) + forecasts = forecasts[ + [ + "device_id", + "site_id", + "timestamp", + "pm2_5", + "margin_of_error", + "adjusted_forecast", + ] + ] return forecasts @staticmethod def save_forecasts_to_mongo(data, frequency): - timestamp = pd.to_datetime(datetime.now()).isoformat() - device_numbers = data["device_number"].unique() + device_ids = data["device_id"].unique() + created_at = pd.to_datetime(datetime.now()).isoformat() forecast_results = [ { - field: data[data["device_number"] == i][field].tolist()[0] - if field != "pm2_5" and field != "time" and field != "health_tips" - else data[data["device_number"] == i][field].tolist() + field: data[data["device_id"] == i][field].tolist()[0] + if field + not in ["pm2_5", "margin_of_error", "adjusted_forecast", "timestamp"] + else data[data["device_id"] == i][field].tolist() for field in data.columns } - | {"timestamp": timestamp} - for i in device_numbers + | {"created_at": created_at} + for i in device_ids ] client = pm.MongoClient(configuration.MONGO_URI) db = client[configuration.MONGO_DATABASE_NAME] if frequency == "hourly": + db.daily_forecasts.delete_many({}) db.hourly_forecasts.insert_many(forecast_results) + print(db.hourly_forecasts.find_one()) # confirm saving has worked elif frequency == "daily": + db.daily_forecasts.delete_many({}) db.daily_forecasts.insert_many(forecast_results) + print(db.daily_forecasts.find_one()) else: raise ValueError("Invalid frequency argument") diff --git a/src/airflow/dags/ml_training_jobs.py b/src/airflow/dags/ml_training_jobs.py index 836441c52a..a517d6a3a7 100644 --- a/src/airflow/dags/ml_training_jobs.py +++ b/src/airflow/dags/ml_training_jobs.py @@ -1,10 +1,12 @@ from airflow.decorators import dag, task from airqo_etl_utils.airflow_custom_utils import AirflowUtils -from airqo_etl_utils.bigquery_api import BigQueryApi from airqo_etl_utils.config import configuration -from airqo_etl_utils.date import date_to_str from airqo_etl_utils.ml_utils import ForecastUtils +from airqo_etl_utils.date import date_to_str +from dateutil.relativedelta import relativedelta +from airqo_etl_utils.bigquery_api import BigQueryApi +from datetime import datetime @dag( @@ -18,9 +20,6 @@ def train_forecasting_models(): # Hourly forecast tasks @task() def fetch_training_data_for_hourly_forecast_model(): - from dateutil.relativedelta import relativedelta - from datetime import datetime - current_date = datetime.today() start_date = current_date - relativedelta( months=int(configuration.HOURLY_FORECAST_TRAINING_JOB_SCOPE) @@ -33,7 +32,7 @@ def preprocess_training_data_for_hourly_forecast_model(data): return ForecastUtils.preprocess_data(data, "hourly") @task() - def 
feature_engineer_training_data_for_hourly_forecast_model(data): + def feat_engineer_training_data_for_hourly_forecast_model(data): return ForecastUtils.feature_eng_data(data, "pm2_5", "hourly", "train") @task() @@ -69,7 +68,7 @@ def train_and_save_daily_model(train_data): hourly_data = fetch_training_data_for_hourly_forecast_model() hourly_data = preprocess_training_data_for_hourly_forecast_model(hourly_data) - hourly_data = feature_engineer_training_data_for_hourly_forecast_model(hourly_data) + hourly_data = feat_engineer_training_data_for_hourly_forecast_model(hourly_data) train_and_save_hourly_forecast_model(hourly_data) daily_data = fetch_training_data_for_daily_forecast_model() diff --git a/src/airflow/requirements.txt b/src/airflow/requirements.txt index 7396bc939c..f5df386d68 100644 --- a/src/airflow/requirements.txt +++ b/src/airflow/requirements.txt @@ -7,7 +7,7 @@ kafka-python simplejson~=3.19.1 sentry-sdk geopy -mlflow~=2.5.0 +mlflow lightgbm~=4.0.0 setuptools~=68.0.0 urllib3~=1.26.16 diff --git a/src/predict/api/helpers.py b/src/predict/api/helpers.py index baa1777d93..f455387d2b 100644 --- a/src/predict/api/helpers.py +++ b/src/predict/api/helpers.py @@ -186,7 +186,7 @@ def get_predictions_by_geo_coordinates_v2(latitude: float, longitude: float) -> @cache.memoize(timeout=Config.CACHE_TIMEOUT) def get_forecasts( db_name, - device_id=None, + device_id=None, site_id=None, site_name=None, parish=None, @@ -218,13 +218,14 @@ def get_forecasts( for time, pm2_5, margin_of_error, adjusted_forecast in zip( site_forecasts[0]["timestamp"], site_forecasts[0]["pm2_5"], - site_forecasts[0]["margin_of_error"], - site_forecasts[0]["adjusted_forecast"], + site_forecasts[0]["margin_of_error"], + site_forecasts[0]["adjusted_forecast"], ): result = { key: value for key, value in zip( - ["time", "pm2_5", "margin_of_error", "adjusted_forecast"], [time, pm2_5, margin_of_error, adjusted_forecast] + ["time", "pm2_5", "margin_of_error", "adjusted_forecast"], + [time, pm2_5, margin_of_error, adjusted_forecast], ) } results.append(result) diff --git a/src/predict/api/prediction.py b/src/predict/api/prediction.py index 068ae4ce48..a58bd2ae82 100644 --- a/src/predict/api/prediction.py +++ b/src/predict/api/prediction.py @@ -77,10 +77,6 @@ def get_next_24hr_forecasts(): """ Get forecasts for the next 24 hours from specified start time. """ - - """ - Get forecasts for the next 1 week from specified start day. 
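# Illustrative sketch, not part of the patch: selecting the health tips whose AQI
# band contains a forecast value; only the "aqi_category" min/max keys mirror the
# API payload, the other fields are made up.
health_tips = [
    {"title": "Enjoy outdoor activities", "aqi_category": {"min": 0, "max": 12}},
    {"title": "Reduce prolonged exertion", "aqi_category": {"min": 12.1, "max": 35.4}},
]
pm2_5 = 20.3
matching_tips = [
    tip
    for tip in health_tips
    if tip["aqi_category"]["min"] <= pm2_5 <= tip["aqi_category"]["max"]
]
print(matching_tips)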
- """ params = { name: request.args.get(name, default=None, type=str) for name in [ @@ -152,8 +148,8 @@ def get_next_1_week_forecasts(): forecast["health_tips"] = list( filter( lambda x: x["aqi_category"]["max"] - >= pm2_5 - >= x["aqi_category"]["min"], + >= pm2_5 + >= x["aqi_category"]["min"], health_tips, ) ) From 071d35120aa07992a6bc76c429d31e146602fc7e Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Thu, 31 Aug 2023 00:27:44 +0300 Subject: [PATCH 16/43] disable error functionality --- src/airflow/airqo_etl_utils/ml_utils.py | 195 ++++++++++++------------ 1 file changed, 98 insertions(+), 97 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 591f72e133..630d936b90 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -99,6 +99,7 @@ def decode_categorical_features_before_save(df, frequency): df[col] = df[col].map({v: k for k, v in mapping.items()}) return df + @staticmethod def encode_categorical_training_features(df, freq): df1 = df.copy() columns = ["device_id", "site_id", "device_category"] @@ -205,10 +206,18 @@ def get_time_and_cyclic_features(df, freq): df1.drop(columns=attributes + ["week"], inplace=True) return df1 + def get_location_cord(df): + df["x_cord"] = np.cos(df["latitude"]) * np.cos(df["longitude"]) + df["y_cord"] = np.cos(df["latitude"]) * np.sin(df["longitude"]) + df["z_cord"] = np.sin(df["latitude"]) + + return df + df_tmp = data.copy() df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) df_tmp = get_lag_features(df_tmp, target_column, data_frequency) df_tmp = get_time_and_cyclic_features(df_tmp, data_frequency) + df_tmp = get_location_cord(df_tmp) if job_type == "train": df_tmp = DecodingUtils.encode_categorical_training_features( df_tmp, data_frequency @@ -233,23 +242,20 @@ def train_and_save_forecast_models(training_data, frequency): Perform the actual training for hourly data """ training_data["timestamp"] = pd.to_datetime(training_data["timestamp"]) - features = [c for c in training_data.columns if c not in ["timestamp", "pm2_5"]] + features = [ + c + for c in training_data.columns + if c not in ["timestamp", "pm2_5", "latitude", "longitude"] + ] print(features) target_col = "pm2_5" train_data = validation_data = test_data = pd.DataFrame() for device in training_data["device_id"].unique(): device_df = training_data[training_data["device_id"] == device] months = device_df["timestamp"].dt.month.unique() - train_months = val_months = test_months = [] - if frequency == "hourly": - train_months = months[:2] - val_months = months[2:3] - test_months = months[3:] - elif frequency == "daily": - train_months = months[:6] - val_months = months[6:7] - test_months = months[7:] - + train_months = months[:8] + val_months = months[8:9] + test_months = months[9:] train_df = device_df[device_df["timestamp"].dt.month.isin(train_months)] val_df = device_df[device_df["timestamp"].dt.month.isin(val_months)] test_df = device_df[device_df["timestamp"].dt.month.isin(test_months)] @@ -345,56 +351,56 @@ def objective(trial): clf, project_id, bucket, f"{frequency}_forecast_model.pkl" ) - def create_error_df(data, target, preds): - error_df = pd.DataFrame( - { - "actual_values": target, - "predicted_values": preds, - } - ) - error_df["errors"] = ( - error_df["predicted_values"] - error_df["actual_values"] - ) - error_df = pd.concat([error_df, data], axis=1) - error_df.drop(["actual_values", "pm2_5"], axis=1, inplace=True) - 
error_df.rename(columns={"predicted_values": "pm2_5"}, inplace=True) - - return error_df - - error_df1 = create_error_df( - train_data, train_target, clf.predict(train_data[features]) - ) - error_df2 = create_error_df( - test_data, test_target, clf.predict(test_data[features]) - ) - - error_features1 = [c for c in error_df1.columns if c not in ["errors"]] - error_features2 = [c for c in error_df2.columns if c not in ["errors"]] - - error_target1 = error_df1["errors"] - error_target2 = error_df2["errors"] - - error_clf = LGBMRegressor( - n_estimators=31, - colsample_bytree=1, - learning_rate=0.1, - metric="rmse", - max_depth=5, - random_state=42, - verbosity=2, - ) - - error_clf.fit( - error_df1[error_features1], - error_target1, - eval_set=[(error_df2[error_features2], error_target2)], - categorical_feature=["device_id", "site_id", "device_category"], - callbacks=[early_stopping(stopping_rounds=150)], - ) - - GCSUtils.upload_trained_model_to_gcs( - error_clf, project_id, bucket, f"{frequency}_error_model.pkl" - ) + # def create_error_df(data, target, preds): + # error_df = pd.DataFrame( + # { + # "actual_values": target, + # "predicted_values": preds, + # } + # ) + # error_df["errors"] = ( + # error_df["predicted_values"] - error_df["actual_values"] + # ) + # error_df = pd.concat([error_df, data], axis=1) + # error_df.drop(["actual_values", "pm2_5"], axis=1, inplace=True) + # error_df.rename(columns={"predicted_values": "pm2_5"}, inplace=True) + # + # return error_df + # + # error_df1 = create_error_df( + # train_data, train_target, clf.predict(train_data[features]) + # ) + # error_df2 = create_error_df( + # test_data, test_target, clf.predict(test_data[features]) + # ) + # + # error_features1 = [c for c in error_df1.columns if c not in ["errors"]] + # error_features2 = [c for c in error_df2.columns if c not in ["errors"]] + # + # error_target1 = error_df1["errors"] + # error_target2 = error_df2["errors"] + # + # error_clf = LGBMRegressor( + # n_estimators=31, + # colsample_bytree=1, + # learning_rate=0.1, + # metric="rmse", + # max_depth=5, + # random_state=42, + # verbosity=2, + # ) + # + # error_clf.fit( + # error_df1[error_features1], + # error_target1, + # eval_set=[(error_df2[error_features2], error_target2)], + # categorical_feature=["device_id", "site_id", "device_category"], + # callbacks=[early_stopping(stopping_rounds=150)], + # ) + # + # GCSUtils.upload_trained_model_to_gcs( + # error_clf, project_id, bucket, f"{frequency}_error_model.pkl" + # ) # TODO: quantile regression approach # alphas = [0.025, 0.975] @@ -432,12 +438,11 @@ def create_error_df(data, target, preds): def generate_forecasts(data, project_name, bucket_name, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) data.columns = data.columns.str.strip() - data["margin_of_error"] = data["adjusted_forecast"] = 0 + # data["margin_of_error"] = data["adjusted_forecast"] = 0 def get_forecasts( df_tmp, forecast_model, - error_model, frequency, horizon, ): @@ -460,8 +465,6 @@ def get_forecasts( df_tmp_no_ts.shift(1, axis=0).rolling(s).agg(f) )["pm2_5"] - print("done") - elif frequency == "hourly": df_tmp.iloc[-1, df_tmp.columns.get_loc("timestamp")] = df_tmp.iloc[ -2, df_tmp.columns.get_loc("timestamp") @@ -480,7 +483,7 @@ def get_forecasts( for s in shifts2: for f in functions: df_tmp[f"pm2_5_{f}_{s}_hour"] = ( - df_tmp.shift(1, axis=0).rolling(s).agg(f) + df_tmp_no_ts.shift(1, axis=0).rolling(s).agg(f) )["pm2_5"] attributes = ["year", "month", "day", "dayofweek"] @@ -508,41 +511,41 @@ def get_forecasts( 2 * 
np.pi * df_tmp.tail(1)["timestamp"].dt.isocalendar().week / 52 ) - # make predictions excluded_columns = [ "pm2_5", "timestamp", - "margin_of_error", - "adjusted_forecast", + "latitude", + "longitude", + # "margin_of_error", + # "adjusted_forecast", ] - excluded_columns_2 = [ - "timestamp", - "margin_of_error", - "adjusted_forecast", - ] - print(df_tmp.tail(1)) + # excluded_columns_2 = [ + # "timestamp", + # "margin_of_error", + # "adjusted_forecast", + # ] df_tmp.loc[df_tmp.index[-1], "pm2_5"] = forecast_model.predict( df_tmp.drop(excluded_columns, axis=1).tail(1).values.reshape(1, -1) ) - df_tmp.loc[df_tmp.index[-1], "margin_of_error"] = error_model.predict( - df_tmp.drop(excluded_columns_2, axis=1) - .tail(1) - .values.reshape(1, -1) - ) - df_tmp.loc[df_tmp.index[-1], "adjusted_forecast"] = ( - df_tmp.loc[df_tmp.index[-1], "pm2_5"] - + df_tmp.loc[df_tmp.index[-1], "margin_of_error"] - ) - - return df_tmp.iloc[-int(horizon) :, :] + # df_tmp.loc[df_tmp.index[-1], "margin_of_error"] = error_model.predict( + # df_tmp.drop(excluded_columns_2, axis=1) + # .tail(1) + # .values.reshape(1, -1) + # ) + # df_tmp.loc[df_tmp.index[-1], "adjusted_forecast"] = ( + # df_tmp.loc[df_tmp.index[-1], "pm2_5"] + # + df_tmp.loc[df_tmp.index[-1], "margin_of_error"] + # ) + + return df_tmp.iloc[-int(horizon):, :] forecasts = pd.DataFrame() forecast_model = GCSUtils.get_trained_model_from_gcs( project_name, bucket_name, f"{frequency}_forecast_model.pkl" ) - error_model = GCSUtils.get_trained_model_from_gcs( - project_name, bucket_name, f"{frequency}_error_model.pkl" - ) + # error_model = GCSUtils.get_trained_model_from_gcs( + # project_name, bucket_name, f"{frequency}_error_model.pkl" + # ) df_tmp = data.copy() for device in df_tmp["device_id"].unique(): @@ -555,7 +558,6 @@ def get_forecasts( device_forecasts = get_forecasts( test_copy, forecast_model, - error_model, frequency, horizon, ) @@ -564,7 +566,7 @@ def get_forecasts( print(device) forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) - forecasts["margin_of_error"] = forecasts["margin_of_error"].astype(float) + # forecasts["margin_of_error"] = forecasts["margin_of_error"].astype(float) DecodingUtils.decode_categorical_features_before_save(forecasts, frequency) forecasts = forecasts[ @@ -573,8 +575,8 @@ def get_forecasts( "site_id", "timestamp", "pm2_5", - "margin_of_error", - "adjusted_forecast", + # "margin_of_error", + # "adjusted_forecast", ] ] return forecasts @@ -586,8 +588,7 @@ def save_forecasts_to_mongo(data, frequency): forecast_results = [ { field: data[data["device_id"] == i][field].tolist()[0] - if field - not in ["pm2_5", "margin_of_error", "adjusted_forecast", "timestamp"] + if field not in ["pm2_5", "timestamp"] else data[data["device_id"] == i][field].tolist() for field in data.columns } @@ -597,12 +598,12 @@ def save_forecasts_to_mongo(data, frequency): client = pm.MongoClient(configuration.MONGO_URI) db = client[configuration.MONGO_DATABASE_NAME] if frequency == "hourly": - db.daily_forecasts.delete_many({}) + db.hourly_forecasts.delete_many({}) db.hourly_forecasts.insert_many(forecast_results) print(db.hourly_forecasts.find_one()) # confirm saving has worked elif frequency == "daily": db.daily_forecasts.delete_many({}) db.daily_forecasts.insert_many(forecast_results) - print(db.daily_forecasts.find_one()) + print(db.daily_forecasts_1.find_one()) else: raise ValueError("Invalid frequency argument") From fc2b8dddeb0246e3b23936476eb62343e84f17a8 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> 
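# Illustrative sketch, not part of the patch: shaping a forecast frame into one
# document per device (series fields as lists, metadata as scalars) before an
# insert_many; the collection name and frame contents below are made up.
import pandas as pd

forecasts = pd.DataFrame({
    "device_id": ["aq_01", "aq_01", "aq_02"],
    "timestamp": ["2023-09-01", "2023-09-02", "2023-09-01"],
    "pm2_5": [12.1, 13.4, 30.2],
})
docs = [
    {
        "device_id": device_id,
        "timestamp": group["timestamp"].tolist(),
        "pm2_5": group["pm2_5"].tolist(),
    }
    for device_id, group in forecasts.groupby("device_id")
]
print(docs)
# then e.g. db.daily_forecasts.delete_many({}); db.daily_forecasts.insert_many(docs)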
Date: Wed, 6 Sep 2023 18:50:52 +0000 Subject: [PATCH 17/43] add bigquery method tests --- src/airflow/airqo_etl_utils/bigquery_api.py | 13 +- .../tests/big_query_api_tests.py | 150 ++++++++++++------ 2 files changed, 110 insertions(+), 53 deletions(-) diff --git a/src/airflow/airqo_etl_utils/bigquery_api.py b/src/airflow/airqo_etl_utils/bigquery_api.py index 1d720b6772..65317a0e77 100644 --- a/src/airflow/airqo_etl_utils/bigquery_api.py +++ b/src/airflow/airqo_etl_utils/bigquery_api.py @@ -619,6 +619,10 @@ def fetch_data( self, start_date_time: str, ) -> pd.DataFrame: + try: + pd.to_datetime(start_date_time) + except ValueError: + raise ValueError(f"Invalid start date time: {start_date_time}") query = f""" SELECT DISTINCT t1.device_id, @@ -637,8 +641,13 @@ def fetch_data( job_config = bigquery.QueryJobConfig() job_config.use_query_cache = True - df = self.client.query(f"{query}", job_config).result().to_dataframe() - return df + try: + df = self.client.query(query, job_config).result().to_dataframe() + return df + except Exception as e: + print("Error fetching data from bigquery") + + @staticmethod def save_forecasts_to_bigquery(df, table): diff --git a/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py b/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py index e519ad033d..a2d00f4b71 100644 --- a/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py +++ b/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py @@ -1,4 +1,6 @@ # Import pytest and other modules as needed +from unittest import mock + import pandas as pd import pytest @@ -6,7 +8,7 @@ @pytest.fixture -def mock_bigquery_client(mocker): +def mock_bigquery_client1(mocker): mock_client = mocker.Mock() mock_client.query.return_value.result.return_value.to_dataframe.return_value = ( pd.DataFrame( @@ -21,66 +23,112 @@ def mock_bigquery_client(mocker): return mock_client +@pytest.fixture +def mock_bigquery_client2(): + """A fixture that mocks the bigquery.Client object.""" + + fake_client = mock.Mock() + + sample_df = pd.DataFrame( + { + "device_id": ["A", "A", "B", "B"], + "timestamp": [ + "2023-01-01 00:00:00", + "2023-01-01 01:00:00", + "2023-01-01 00:00:00", + "2023-01-01 01:00:00", + ], + "site_id": [1, 1, 2, 2], + "pm2_5": [10.0, 12.0, 15.0, 18.0], + "latitude": [10.0, 10.0, 20.0, 20.0], + "longitude": [10.0, 10.0, 20.0, 20.0], + "device_category": ["A", "A", "B", "B"], + } + ) + + fake_data_empty_result = pd.DataFrame() + + fake_error = "Fake error" + + def fake_query(query, job_config): + fake_job = mock.Mock() + + if "2023-01-01" in query: + fake_job.result.return_value.to_dataframe.return_value = ( + sample_df + ) + elif "2023-01-02" in query: + fake_job.result.return_value.to_dataframe.return_value = ( + fake_data_empty_result + ) + elif "2023-01-03" in query: + fake_job.result.side_effect = fake_error + else: + raise ValueError("Invalid date") + + return fake_job + + fake_client.query.side_effect = fake_query + + return fake_client + @pytest.mark.parametrize( - "method", + "start_date_time, expected_df", [ - BigQueryApi.fetch_hourly_forecast_training_data, - BigQueryApi.fetch_daily_forecast_training_data, + ( + "2023-01-01", + pd.DataFrame( + { + "device_id": ["A", "A", "B", "B"], + "timestamp": [ + "2023-01-01 00:00:00", + "2023-01-01 01:00:00", + "2023-01-01 00:00:00", + "2023-01-01 01:00:00", + ], + "site_id": [1, 1, 2, 2], + "pm2_5": [10.0, 12.0, 15.0, 18.0], + "latitude": [10.0, 10.0, 20.0, 20.0], + "longitude": [10.0, 10.0, 20.0, 20.0], + "device_category": ["A", "A", "B", "B"], + } + ), + ), + 
("2023-01-02", pd.DataFrame()), ], ) -def test_fetch_data_columns(method, mock_bigquery_client): - api = BigQueryApi() - api.client = mock_bigquery_client - df = method(api) - assert list(df.columns) == ["created_at", "device_number", "pm2_5"] - assert isinstance(df, pd.DataFrame) - assert not df.empty +def test_fetch_data_correct_se(mock_bigquery_client2, start_date_time, expected_df): + """Tests the fetch_data method for the happy path scenarios.""" -def test_fetch_hourly_forecast_training_data_exception(mock_bigquery_client): - api = BigQueryApi() - api.client = mock_bigquery_client - api.client.query.side_effect = Exception("Bigquery error") - with pytest.raises(Exception) as e: - df = api.fetch_hourly_forecast_training_data() - assert "Bigquery error" in str(e.value) + bq_api = BigQueryApi() + bq_api.client = mock_bigquery_client2 + actual_df = bq_api.fetch_data(start_date_time) + pd.testing.assert_frame_equal(actual_df, expected_df) -def test_fetch_hourly_forecast_training_data_null(): - api = BigQueryApi() - api.client = mock_bigquery_client() - api.client.query.return_value.result.return_value.to_dataframe.return_value = ( - pd.DataFrame( - { - "created_at": ["2021-01-01 00:00:00", "2021-01-01 01:00:00"], - "device_number": [1, 2], - "pm2_5": [None, None], - } - ) - ) - with pytest.raises(Exception) as e: - df = api.fetch_hourly_forecast_training_data() - assert "pm2_5 column cannot be null" in str(e.value) +@pytest.mark.parametrize("start_date_time", ["2023-13-01", "2023-01-32", "invalid"]) +def test_fetch_data_invalid_date(mock_bigquery_client2, start_date_time): + """Tests the fetch_data method for the scenario where an invalid date string is passed.""" + + bq_api = BigQueryApi() + bq_api.client = mock_bigquery_client2 + + with pytest.raises(ValueError): + bq_api.fetch_data(start_date_time) + +@pytest.mark.parametrize("start_date_time", ["2023-01-03"]) +def test_fetch_data_bigquery_error(mock_bigquery_client2, start_date_time): + """Tests the fetch_data method for the scenario where a bigquery.GoogleAPIError is raised.""" + + # Create an instance of BigQueryApi with the mocked client + bq_api = BigQueryApi() + bq_api.client = mock_bigquery_client2 + + with pytest.raises(Exception): + bq_api.fetch_data(start_date_time) -def test_fetch_daily_forecast_training_data_date_range(mock_bigquery_client): - api = BigQueryApi() - api.client = mock_bigquery_client - api.client.query.return_value.result.return_value.to_dataframe.return_value = ( - pd.DataFrame( - { - "created_at": [ - "2020-01-01 00:00:00", - "2020-06-01 00:00:00", - "2020-12-01 00:00:00", - ], - "device_number": [1, 2, 3], - "pm2_5": [10, 20, 30], - } - ) - ) - df = api.fetch_daily_forecast_training_data() - assert df["created_at"].min() >= pd.Timestamp.now() - pd.DateOffset(months=12) def test_fetch_raw_readings_empty(mock_bigquery_client): From 545be08bec375619a500b5b096608ab534b96369 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Thu, 14 Sep 2023 11:17:10 +0300 Subject: [PATCH 18/43] add preprocessing tests --- src/airflow/airqo_etl_utils/ml_utils.py | 14 +++- .../tests/airqo_utils_tests.py | 1 - .../tests/big_query_api_tests.py | 2 +- src/airflow/airqo_etl_utils/tests/conftest.py | 78 +++---------------- .../airqo_etl_utils/tests/ml_utils_tests.py | 44 ++++------- 5 files changed, 40 insertions(+), 99 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 630d936b90..095a3caf0d 100644 --- 
a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -14,7 +14,6 @@ from .config import configuration -fixed_columns = ["site_id"] project_id = configuration.GOOGLE_CLOUD_PROJECT_ID bucket = configuration.FORECAST_MODELS_BUCKET environment = configuration.ENVIRONMENT @@ -126,7 +125,18 @@ def encode_categorical_training_features(df, freq): class ForecastUtils: @staticmethod def preprocess_data(data, data_frequency): - data["timestamp"] = pd.to_datetime(data["timestamp"]) + required_columns = {"device_id", "site_id", "device_category", "pm2_5", "timestamp"} + if not required_columns.issubset(data.columns): + missing_columns = required_columns.difference(data.columns) + raise ValueError( + f"Provided dataframe missing necessary columns: {', '.join(missing_columns)}" + ) + try: + data["timestamp"] = pd.to_datetime(data["timestamp"]) + except ValueError as e: + raise ValueError( + "datetime conversion error, please provide timestamp in valid format" + ) data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])[ "pm2_5" ].transform(lambda x: x.interpolate(method="linear", limit_direction="both")) diff --git a/src/airflow/airqo_etl_utils/tests/airqo_utils_tests.py b/src/airflow/airqo_etl_utils/tests/airqo_utils_tests.py index 6e3792f74e..0d1541df67 100644 --- a/src/airflow/airqo_etl_utils/tests/airqo_utils_tests.py +++ b/src/airflow/airqo_etl_utils/tests/airqo_utils_tests.py @@ -10,7 +10,6 @@ from airqo_etl_utils.tests.conftest import FaultDetectionFixtures -# TODO: Convert to pytest class TestAirQoDataUtils(unittest.TestCase): def test_map_site_ids_to_historical_data(self): logs = pd.DataFrame( diff --git a/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py b/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py index a2d00f4b71..da4a20e4f4 100644 --- a/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py +++ b/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py @@ -99,7 +99,7 @@ def fake_query(query, job_config): ) def test_fetch_data_correct_se(mock_bigquery_client2, start_date_time, expected_df): - """Tests the fetch_data method for the happy path scenarios.""" + """Tests the fetch_data method for scenarios when correct data is retrieved.""" bq_api = BigQueryApi() bq_api.client = mock_bigquery_client2 diff --git a/src/airflow/airqo_etl_utils/tests/conftest.py b/src/airflow/airqo_etl_utils/tests/conftest.py index 8ec08842b2..e63ec4dab8 100644 --- a/src/airflow/airqo_etl_utils/tests/conftest.py +++ b/src/airflow/airqo_etl_utils/tests/conftest.py @@ -1,7 +1,7 @@ -import numpy as np +from datetime import datetime + import pandas as pd import pytest -from datetime import datetime def pytest_configure(config): @@ -13,75 +13,17 @@ def pytest_configure(config): class ForecastFixtures: @staticmethod @pytest.fixture(scope="session") - def hourly_data(): - return pd.DataFrame( - { - "device_number": [1, 1, 1, 2, 2, 2], - "created_at": [ - "2021-08-01 00:00:00", - "2021-08-01 01:00:00", - "2021-08-01 02:00:00", - "2021-08-01 00:00:00", - "2021-08-01 01:00:00", - "2021-08-01 02:00:00", - ], - "pm2_5": [10.0, np.nan, 12.0, 15.0, np.nan, np.nan], - } - ) - - @staticmethod - @pytest.fixture(scope="session") - def daily_data(): - return pd.DataFrame( - { - "device_number": [1, 1, 1, 2, 2, 2], - "created_at": [ - "2021-08-01 00:00:00", - "2021-08-02 00:00:00", - "2021-08-03 00:00:00", - "2021-08-01 00:00:00", - "2021-08-02 00:00:00", - "2021-08-03 00:00:00", - ], - "pm2_5": [10.0, np.nan, 12.0, 15.0, np.nan, np.nan], - } - ) - - 
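# Illustrative sketch, not part of the patch: per-device gap filling and daily
# resampling in the style of the preprocessing step; the frame below is a tiny
# synthetic example.
import numpy as np
import pandas as pd

data = pd.DataFrame({
    "device_id": ["A", "A", "A", "B", "B"],
    "timestamp": pd.to_datetime([
        "2023-01-01 00:00", "2023-01-01 01:00", "2023-01-02 00:00",
        "2023-01-01 00:00", "2023-01-01 01:00",
    ]),
    "pm2_5": [10.0, np.nan, 14.0, 20.0, np.nan],
})
data["pm2_5"] = data.groupby("device_id")["pm2_5"].transform(
    lambda x: x.interpolate(method="linear", limit_direction="both")
)
daily = (
    data.groupby("device_id")
    .resample("D", on="timestamp")
    .mean(numeric_only=True)
    .reset_index()
)
print(daily)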
@staticmethod - @pytest.fixture(scope="session") - def hourly_output(): - return pd.DataFrame( - { - "device_number": [1, 1, 1, 2, 2, 2], - "created_at": [ - "2021-08-01 00:00:00", - "2021-08-01 01:00:00", - "2021-08-01 02:00:00", - "2021-08-01 00:00:00", - "2021-08-01 01:00:00", - "2021-08-01 02:00:00", - ], - "pm2_5": [10.0, 11.0, 12.0, 15.0, 16.0, 17.0], - } - ) - - @staticmethod - @pytest.fixture(scope="session") - def daily_output(): - return pd.DataFrame( + def example_data(): + data = pd.DataFrame( { - "device_number": [1, 1, 1, 2, 2, 2], - "created_at": [ - "2021-08-01 00:00:00", - "2021-08-02 00:00:00", - "2021-08-03 00:00:00", - "2021-08-01 00:00:00", - "2021-08-02 00:00:00", - "2021-08-03 00:00:00", - ], - "pm2_5": [10.0, 11.0, 12.0, 15.0, 16.0, 17.0], + "device_id": ["A", "B"], + "site_id": ["X", "Y"], + "device_category": ["LOWCOST", "BAM"], + "pm2_5": [1, 2], + "timestamp": ["2023-01-01", "2023-02-01"], } ) + return data @pytest.fixture(scope="session") diff --git a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py index e28e592cea..0e707711c4 100644 --- a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py +++ b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py @@ -1,32 +1,22 @@ -# TODO: Add tests for ml_utils.py +import pytest -import pandas as pd - -from airqo_etl_utils.ml_utils import ForecastUtils +from airqo_etl_utils.ml_utils import ForecastUtils as FUtils from airqo_etl_utils.tests.conftest import ForecastFixtures -class ForecastTests(ForecastFixtures): - def test_preprocess_hourly_training_data(self, hourly_data, hourly_output): - assert isinstance( - ForecastUtils.preprocess_hourly_training_data(hourly_data), pd.DataFrame - ) - assert ( - ForecastUtils.preprocess_hourly_training_data(hourly_data).shape[0] - == hourly_output.shape[0] - ) - assert ForecastUtils.preprocess_hourly_training_data(hourly_data)[ - "pm2_5" - ].equals(hourly_output["pm2_5"]) +class TestsForecasts(ForecastFixtures): + def test_preprocess_data_typical_case(self, example_data): + result = FUtils.preprocess_data(example_data, "daily") + assert "pm2_5" in result.columns + + def test_preprocess_data_invalid_input(self, example_data): + df = example_data.drop(columns=["device_id"]) + with pytest.raises(ValueError): + FUtils.preprocess_data(df, "daily") - def test_preprocess_daily_training_data(self, daily_data, daily_output): - assert isinstance( - ForecastUtils.preprocess_daily_training_data(daily_data), pd.DataFrame - ) - assert ( - ForecastUtils.preprocess_daily_training_data(daily_data).shape[0] - == daily_output.shape[0] - ) - assert ForecastUtils.preprocess_daily_training_data(daily_data)["pm2_5"].equals( - daily_output["pm2_5"] - ) + def test_preprocess_data_invalid_timestamp(self, example_data): + # Invalid timestamp + df = example_data.copy() + df["timestamp"] = "invalid" + with pytest.raises(ValueError): + FUtils.preprocess_data(df, "daily") \ No newline at end of file From 15909c150468ca7844235a0f2e0748ba6ca2acf5 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Thu, 14 Sep 2023 22:00:02 +0300 Subject: [PATCH 19/43] add feature engineering tests --- src/airflow/airqo_etl_utils/ml_utils.py | 187 ++++++++++-------- src/airflow/airqo_etl_utils/tests/conftest.py | 24 ++- .../airqo_etl_utils/tests/ml_utils_tests.py | 83 +++++++- src/airflow/dags/ml_training_jobs.py | 49 ++++- 4 files changed, 239 insertions(+), 104 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py 
b/src/airflow/airqo_etl_utils/ml_utils.py index 095a3caf0d..f35116c014 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -100,6 +100,7 @@ def decode_categorical_features_before_save(df, frequency): @staticmethod def encode_categorical_training_features(df, freq): + df["timestamp"] = pd.to_datetime("timestamp") df1 = df.copy() columns = ["device_id", "site_id", "device_category"] mappings = [] @@ -155,102 +156,118 @@ def preprocess_data(data, data_frequency): data = data.dropna(subset=["pm2_5"]) return data + @staticmethod - def feature_eng_data(data, target_column, data_frequency, job_type): - def get_lag_features(df, target_col, freq): - df1 = df.copy() # use copy to prevent terminal warning - if freq == "daily": - shifts = [1, 2, 3, 7, 14] - for s in shifts: - df1[f"pm2_5_last_{s}_day"] = df1.groupby(["device_id"])[ - target_col - ].shift(s) - - shifts = [2, 3, 7, 14] - functions = ["mean", "std", "max", "min"] - for s in shifts: - for f in functions: - df1[f"pm2_5_{f}_{s}_day"] = ( - df1.groupby(["device_id"])[target_col] - .shift(1) - .rolling(s) - .agg(f) - ) - elif freq == "hourly": - shifts = [1, 2, 6, 12] - for s in shifts: - df1[f"pm2_5_last_{s}_hour"] = df1.groupby(["device_id"])[ - target_col - ].shift(s) - - shifts = [3, 6, 12, 24] - functions = ["mean", "std", "median", "skew"] - for s in shifts: - for f in functions: - df1[f"pm2_5_{f}_{s}_hour"] = ( - df1.groupby(["device_id"])[target_col] - .shift(1) - .rolling(s) - .agg(f) - ) - else: - raise ValueError("Invalid frequency") - - return df1 - - def get_time_and_cyclic_features(df, freq): - df1 = df.copy() - attributes = ["year", "month", "day", "dayofweek"] - max_vals = [2023, 12, 30, 7] - if freq == "hourly": - attributes.append("hour") - max_vals.append(23) - for a, m in zip(attributes, max_vals): - df1[a] = df1["timestamp"].dt.__getattribute__(a) - df1[a + "_sin"] = np.sin(2 * np.pi * df1[a] / m) - df1[a + "_cos"] = np.cos(2 * np.pi * df1[a] / m) - - df1["week"] = df1["timestamp"].dt.isocalendar().week - df1["week_sin"] = np.sin(2 * np.pi * df1["week"] / 52) - df1["week_cos"] = np.cos(2 * np.pi * df1["week"] / 52) - df1.drop(columns=attributes + ["week"], inplace=True) - return df1 - - def get_location_cord(df): - df["x_cord"] = np.cos(df["latitude"]) * np.cos(df["longitude"]) - df["y_cord"] = np.cos(df["latitude"]) * np.sin(df["longitude"]) - df["z_cord"] = np.sin(df["latitude"]) + def get_lag_and_roll_features(df, target_col, freq): + if df.empty: + raise ValueError("Empty dataframe provided") + + if target_col not in df.columns or "timestamp" not in df.columns or "device_id" not in df.columns: + raise ValueError("Required columns missing") + + df["timestamp"] = pd.to_datetime(df["timestamp"]) - return df + df1 = df.copy() # use copy to prevent terminal warning + if freq == "daily": + shifts = [1, 2, 3, 7, 14] + for s in shifts: + df1[f"pm2_5_last_{s}_day"] = df1.groupby(["device_id"])[ + target_col + ].shift(s) + shifts = [2, 3, 7, 14] + functions = ["mean", "std", "max", "min"] + for s in shifts: + for f in functions: + df1[f"pm2_5_{f}_{s}_day"] = ( + df1.groupby(["device_id"])[target_col] + .shift(1) + .rolling(s) + .agg(f) + ) + elif freq == "hourly": + shifts = [1, 2, 6, 12] + for s in shifts: + df1[f"pm2_5_last_{s}_hour"] = df1.groupby(["device_id"])[ + target_col + ].shift(s) + shifts = [3, 6, 12, 24] + functions = ["mean", "std", "median", "skew"] + for s in shifts: + for f in functions: + df1[f"pm2_5_{f}_{s}_hour"] = ( + df1.groupby(["device_id"])[target_col] + 
.shift(1) + .rolling(s) + .agg(f) + ) + else: + raise ValueError("Invalid frequency") + return df1 - df_tmp = data.copy() - df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) - df_tmp = get_lag_features(df_tmp, target_column, data_frequency) - df_tmp = get_time_and_cyclic_features(df_tmp, data_frequency) - df_tmp = get_location_cord(df_tmp) - if job_type == "train": - df_tmp = DecodingUtils.encode_categorical_training_features( - df_tmp, data_frequency - ) - elif job_type == "predict": - df_tmp = DecodingUtils.decode_categorical_features_pred( - df_tmp, data_frequency - ) - df_tmp.dropna( - subset=["device_id", "site_id", "device_category"], inplace=True - ) # only 1 row, not sure why + @staticmethod + def get_time_and_cyclic_features(df, freq): - df_tmp["device_id"] = df_tmp["device_id"].astype(int) - df_tmp["site_id"] = df_tmp["site_id"].astype(int) - df_tmp["device_category"] = df_tmp["device_category"].astype(int) + df["timestamp"] = pd.to_datetime(df["timestamp"]) + df1 = df.copy() + attributes = ["year", "month", "day", "dayofweek"] + max_vals = [2023, 12, 30, 7] + if freq == "hourly": + attributes.append("hour") + max_vals.append(23) + for a, m in zip(attributes, max_vals): + df1[a] = df1["timestamp"].dt.__getattribute__(a) + df1[a + "_sin"] = np.sin(2 * np.pi * df1[a] / m) + df1[a + "_cos"] = np.cos(2 * np.pi * df1[a] / m) + + df1["week"] = df1["timestamp"].dt.isocalendar().week + df1["week_sin"] = np.sin(2 * np.pi * df1["week"] / 52) + df1["week_cos"] = np.cos(2 * np.pi * df1["week"] / 52) + df1.drop(columns=attributes + ["week"], inplace=True) + return df1 + + @staticmethod + def get_location_features(df): + df["timestamp"] = pd.to_datetime(df) + df["x_cord"] = np.cos(df["latitude"]) * np.cos(df["longitude"]) + df["y_cord"] = np.cos(df["latitude"]) * np.sin(df["longitude"]) + df["z_cord"] = np.sin(df["latitude"]) + + return df - return df_tmp + # df_tmp = get_lag_features(df_tmp, target_column, data_frequency) + # df_tmp = get_time_and_cyclic_features(df_tmp, data_frequency) + # df_tmp = get_location_cord(df_tmp) + # if job_type == "train": + # df_tmp = DecodingUtils.encode_categorical_training_features( + # df_tmp, data_frequency + # ) + # elif job_type == "predict": + # df_tmp = DecodingUtils.decode_categorical_features_pred( + # df_tmp, data_frequency + # ) + # df_tmp.dropna( + # subset=["device_id", "site_id", "device_category"], inplace=True + # ) # only 1 row, not sure why + # + # df_tmp["device_id"] = df_tmp["device_id"].astype(int) + # df_tmp["site_id"] = df_tmp["site_id"].astype(int) + # df_tmp["device_category"] = df_tmp["device_category"].astype(int) + # + # return df_tmp @staticmethod def train_and_save_forecast_models(training_data, frequency): """ Perform the actual training for hourly data """ + training_data.dropna( + subset=["device_id", "site_id", "device_category"], inplace=True + ) + + training_data["device_id"] = training_data["device_id"].astype(int) + training_data["site_id"] = training_data["site_id"].astype(int) + training_data["device_category"] = training_data["device_category"].astype(int) + training_data["timestamp"] = pd.to_datetime(training_data["timestamp"]) features = [ c diff --git a/src/airflow/airqo_etl_utils/tests/conftest.py b/src/airflow/airqo_etl_utils/tests/conftest.py index e63ec4dab8..17433693da 100644 --- a/src/airflow/airqo_etl_utils/tests/conftest.py +++ b/src/airflow/airqo_etl_utils/tests/conftest.py @@ -13,7 +13,7 @@ def pytest_configure(config): class ForecastFixtures: @staticmethod @pytest.fixture(scope="session") - def 
example_data():
+    def preprocessing_sample_df():
         data = pd.DataFrame(
             {
                 "device_id": ["A", "B"],
@@ -26,6 +26,28 @@ def example_data():
         return data
 
+    @staticmethod
+    @pytest.fixture
+    def feat_eng_sample_df_daily():
+        data = {
+            "timestamp": pd.date_range(end = pd.Timestamp.now(), periods=365).tolist(),
+            "device_id": ["device1"] * 365,
+            "pm2_5": range(1, 366),
+        }
+        return pd.DataFrame(data)
+
+    @staticmethod
+    @pytest.fixture
+    def feat_eng_sample_df_hourly():
+        data = {
+            "timestamp": pd.date_range(end = pd.Timestamp.now(), periods=24*14, freq='H').tolist(),
+            "device_id": ["device1"] * 24*14,
+            "pm2_5": range(1, 24*14+1),
+        }
+        return pd.DataFrame(data)
+
+
+
 @pytest.fixture(scope="session")
 def mongo_fixture():
     from airqo_etl_utils.mongo_client import MongoClient
diff --git a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py
index 0e707711c4..7b56aa2a55 100644
--- a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py
+++ b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py
@@ -1,3 +1,4 @@
+import pandas as pd
 import pytest
 
 from airqo_etl_utils.ml_utils import ForecastUtils as FUtils
@@ -5,18 +6,84 @@
 
 
 class TestsForecasts(ForecastFixtures):
-    def test_preprocess_data_typical_case(self, example_data):
-        result = FUtils.preprocess_data(example_data, "daily")
+    # Preprocess data tests
+    def test_preprocess_data_typical_case(self, preprocessing_sample_df):
+        result = FUtils.preprocess_data(preprocessing_sample_df, "daily")
         assert "pm2_5" in result.columns
 
-    def test_preprocess_data_invalid_input(self, example_data):
-        df = example_data.drop(columns=["device_id"])
+    def test_preprocess_data_invalid_input(self, preprocessing_sample_df):
+        df = preprocessing_sample_df.drop(columns=["device_id"])
         with pytest.raises(ValueError):
             FUtils.preprocess_data(df, "daily")
 
-    def test_preprocess_data_invalid_timestamp(self, example_data):
-        # Invalid timestamp
-        df = example_data.copy()
+    def test_preprocess_data_invalid_timestamp(self, preprocessing_sample_df):
+        df = preprocessing_sample_df.copy()
         df["timestamp"] = "invalid"
         with pytest.raises(ValueError):
-            FUtils.preprocess_data(df, "daily")
\ No newline at end of file
+            FUtils.preprocess_data(df, "daily")
+
+    # Feature engineering tests
+    # get_lag_and_rolling_features tests
+
+    def test_empty_df(self):
+        with pytest.raises(ValueError, match="Empty dataframe provided"):
+            FUtils.get_lag_and_roll_features(pd.DataFrame(), "pm2_5", "daily")
+
+    def test_missing_columns(self, feat_eng_sample_df_daily):
+        del feat_eng_sample_df_daily[
+            "device_id"
+        ]  # Test for case where 'device_id' is missing
+        with pytest.raises(ValueError, match="Required columns missing"):
+            FUtils.get_lag_and_roll_features(feat_eng_sample_df_daily, "pm2_5", "daily")
+
+    def test_invalid_frequency(self, feat_eng_sample_df_daily):
+        with pytest.raises(ValueError, match="Invalid frequency"):
+            FUtils.get_lag_and_roll_features(
+                feat_eng_sample_df_daily, "pm2_5", "annually"
+            )
+
+    def test_hourly_freq(self, feat_eng_sample_df_hourly):
+        hourly_df = FUtils.get_lag_and_roll_features(
+            feat_eng_sample_df_hourly, "pm2_5", "hourly"
+        )
+        for s in [1, 2, 6, 12]:
+            assert f"pm2_5_last_{s}_hour" in hourly_df.columns
+        for s in [3, 6, 12, 24]:
+            for f in ["mean", "std", "median", "skew"]:
+                assert f"pm2_5_{f}_{s}_hour" in hourly_df.columns
+
+    def test_daily_freq(self, feat_eng_sample_df_daily):
+        daily_df = FUtils.get_lag_and_roll_features(
+            feat_eng_sample_df_daily, "pm2_5", "daily"
+        )
+        for s in [1, 2, 3, 7, 14]:
+            assert
f"pm2_5_last_{s}_day" in daily_df.columns + for s in [2, 3, 7, 14]: + for f in ["mean", "std", "max", "min"]: + assert f"pm2_5_{f}_{s}_day" in daily_df.columns + + def test_empty_df_for_time_and_cyclic_features(self): + with pytest.raises(ValueError, match="Empty dataframe provided"): + FUtils.get_time_and_cyclic_features(pd.DataFrame(), "daily") + + def test_missing_columns_for_time_and_cyclic_features(self, feat_eng_sample_df_daily): + with pytest.raises(ValueError, match="Required columns missing"): + FUtils.get_time_and_cyclic_features(feat_eng_sample_df_daily, "daily") + + def test_invalid_frequency_for_time_and_cyclic_features(self, feat_eng_sample_df_daily): + with pytest.raises(ValueError, match="Invalid frequency"): + FUtils.get_time_and_cyclic_features(feat_eng_sample_df_daily, "annually") + +# For 'daily' frequency + def test_daily_freq_for_time_and_cyclic_features(self, feat_eng_sample_df_daily): + daily_df = FUtils.get_time_and_cyclic_features(feat_eng_sample_df_daily, "daily") + for a in ["year", "month", "day", "dayofweek", "week"]: + for t in ["_sin", "_cos"]: + assert f"{a}{t}" in daily_df.columns + +# For 'hourly' frequency + def test_hourly_freq_for_time_and_cyclic_features(self, feat_eng_sample_df_hourly): + hourly_df = FUtils.get_time_and_cyclic_features(feat_eng_sample_df_hourly, "hourly") + for a in ["year", "month", "day", "dayofweek", "hour", "week"]: + for t in ["_sin", "_cos"]: + assert f"{a}{t}" in hourly_df.columns diff --git a/src/airflow/dags/ml_training_jobs.py b/src/airflow/dags/ml_training_jobs.py index a517d6a3a7..e5822d12cc 100644 --- a/src/airflow/dags/ml_training_jobs.py +++ b/src/airflow/dags/ml_training_jobs.py @@ -1,12 +1,13 @@ +from datetime import datetime + from airflow.decorators import dag, task +from dateutil.relativedelta import relativedelta from airqo_etl_utils.airflow_custom_utils import AirflowUtils +from airqo_etl_utils.bigquery_api import BigQueryApi from airqo_etl_utils.config import configuration -from airqo_etl_utils.ml_utils import ForecastUtils from airqo_etl_utils.date import date_to_str -from dateutil.relativedelta import relativedelta -from airqo_etl_utils.bigquery_api import BigQueryApi -from datetime import datetime +from airqo_etl_utils.ml_utils import ForecastUtils, DecodingUtils @dag( @@ -32,10 +33,21 @@ def preprocess_training_data_for_hourly_forecast_model(data): return ForecastUtils.preprocess_data(data, "hourly") @task() - def feat_engineer_training_data_for_hourly_forecast_model(data): - return ForecastUtils.feature_eng_data(data, "pm2_5", "hourly", "train") + def get_hourly_lag_and_rolling_features(data): + return ForecastUtils.get_lag_and_roll_features(data, 'pm2_5', 'hourly') + + @task() + def get_hourly_time_and_cyclic_features(data): + return ForecastUtils.get_time_and_cyclic_features(data, 'hourly') @task() + def get_location_features(data): + return ForecastUtils.get_location_features(data) + + @task() + def encode_categorical_features(data): + return DecodingUtils.encode_categorical_training_features(data, 'daily') + @task() def train_and_save_hourly_forecast_model(train_data): return ForecastUtils.train_and_save_forecast_models( train_data, frequency="hourly" @@ -59,21 +71,38 @@ def preprocess_training_data_for_daily_forecast_model(data): return ForecastUtils.preprocess_data(data, "daily") @task() - def feature_engineer_data_for_daily_forecast_model(data): - return ForecastUtils.feature_eng_data(data, "pm2_5", "daily", "train") + def get_daily_lag_and_rolling_features(data): + return 
ForecastUtils.get_lag_and_roll_features(data, "pm2_5", "daily") + + @task() + def get_daily_time_and_cylic_features(data): + return ForecastUtils.get_time_and_cyclic_features(data, 'daily') + + @task() + def get_location_features(data): + return ForecastUtils.get_location_features(data) @task() + def encode_categorical_features(data): + return DecodingUtils.encode_categorical_training_features(data, 'daily') + @task() def train_and_save_daily_model(train_data): return ForecastUtils.train_and_save_forecast_models(train_data, "daily") hourly_data = fetch_training_data_for_hourly_forecast_model() hourly_data = preprocess_training_data_for_hourly_forecast_model(hourly_data) - hourly_data = feat_engineer_training_data_for_hourly_forecast_model(hourly_data) + hourly_data = get_hourly_lag_and_rolling_features(hourly_data) + hourly_data = get_hourly_time_and_cyclic_features(hourly_data) + hourly_data = get_location_features(hourly_data) + hourly_data = encode_categorical_features(hourly_data) train_and_save_hourly_forecast_model(hourly_data) daily_data = fetch_training_data_for_daily_forecast_model() daily_data = preprocess_training_data_for_daily_forecast_model(daily_data) - daily_data = feature_engineer_data_for_daily_forecast_model(daily_data) + daily_data = get_daily_lag_and_rolling_features(daily_data) + daily_data = get_daily_time_and_cylic_features(daily_data) + daily_data = get_location_features(daily_data) + daily_data = encode_categorical_features(daily_data) train_and_save_daily_model(daily_data) From f18da9b5506bad7d4fd3b35d03f8240064032de4 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Sat, 16 Sep 2023 13:15:05 +0300 Subject: [PATCH 20/43] training job cleanup --- src/airflow/airqo_etl_utils/ml_utils.py | 31 ++++++++++--- src/airflow/airqo_etl_utils/tests/conftest.py | 15 ++++++- .../airqo_etl_utils/tests/ml_utils_tests.py | 43 +++++++++++++++++++ src/airflow/dev-requirements.txt | 2 +- 4 files changed, 83 insertions(+), 8 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index f35116c014..3f541662f7 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -22,6 +22,7 @@ class GCSUtils: + """ Utility class for saving and retrieving models from GCS""" # TODO: In future, save and retrieve models from mlflow instead of GCS @staticmethod def get_trained_model_from_gcs(project_name, bucket_name, source_blob_name): @@ -66,6 +67,7 @@ def get_mapping_from_gcs(project_name, bucket_name, source_blob_name): class DecodingUtils: + """ Utility class for encoding and decoding categorical features""" @staticmethod def decode_categorical_features_pred(df, frequency): columns = ["device_id", "site_id", "device_category"] @@ -100,7 +102,7 @@ def decode_categorical_features_before_save(df, frequency): @staticmethod def encode_categorical_training_features(df, freq): - df["timestamp"] = pd.to_datetime("timestamp") + df["timestamp"] = pd.to_datetime(df["timestamp"]) df1 = df.copy() columns = ["device_id", "site_id", "device_category"] mappings = [] @@ -169,12 +171,12 @@ def get_lag_and_roll_features(df, target_col, freq): df1 = df.copy() # use copy to prevent terminal warning if freq == "daily": - shifts = [1, 2, 3, 7, 14] + shifts = [1, 2, 3, 7] for s in shifts: df1[f"pm2_5_last_{s}_day"] = df1.groupby(["device_id"])[ target_col ].shift(s) - shifts = [2, 3, 7, 14] + shifts = [2, 3, 7] functions = ["mean", "std", "max", "min"] for s in shifts: for f in 
functions: @@ -206,7 +208,16 @@ def get_lag_and_roll_features(df, target_col, freq): @staticmethod def get_time_and_cyclic_features(df, freq): + if df.empty: + raise ValueError("Empty dataframe provided") + + if "timestamp" not in df.columns: + raise ValueError("Required columns missing") + + df["timestamp"] = pd.to_datetime(df["timestamp"]) + if freq not in ["daily", "hourly"]: + raise ValueError("Invalid frequency") df["timestamp"] = pd.to_datetime(df["timestamp"]) df1 = df.copy() attributes = ["year", "month", "day", "dayofweek"] @@ -227,7 +238,15 @@ def get_time_and_cyclic_features(df, freq): @staticmethod def get_location_features(df): - df["timestamp"] = pd.to_datetime(df) + if df.empty: + raise ValueError("Empty dataframe provided") + + for column_name in ["timestamp", "latitude", "longitude"]: + if column_name not in df.columns: + raise ValueError(f"{column_name} column is missing") + + df["timestamp"] = pd.to_datetime(df["timestamp"]) + df["x_cord"] = np.cos(df["latitude"]) * np.cos(df["longitude"]) df["y_cord"] = np.cos(df["latitude"]) * np.sin(df["longitude"]) df["z_cord"] = np.sin(df["latitude"]) @@ -480,11 +499,11 @@ def get_forecasts( # daily frequency if frequency == "daily": df_tmp.tail(1)["timestamp"] += timedelta(days=1) - shifts1 = [1, 2, 3, 7, 14] + shifts1 = [1, 2, 3, 7] for s in shifts1: df_tmp[f"pm2_5_last_{s}_day"] = df_tmp.shift(s, axis=0)["pm2_5"] # rolling features - shifts2 = [2, 3, 7, 14] + shifts2 = [2, 3, 7] functions = ["mean", "std", "max", "min"] for s in shifts2: for f in functions: diff --git a/src/airflow/airqo_etl_utils/tests/conftest.py b/src/airflow/airqo_etl_utils/tests/conftest.py index 17433693da..4f24ccd45e 100644 --- a/src/airflow/airqo_etl_utils/tests/conftest.py +++ b/src/airflow/airqo_etl_utils/tests/conftest.py @@ -1,5 +1,6 @@ from datetime import datetime +import numpy as np import pandas as pd import pytest @@ -10,6 +11,8 @@ def pytest_configure(config): ) + + class ForecastFixtures: @staticmethod @pytest.fixture(scope="session") @@ -46,7 +49,17 @@ def feat_eng_sample_df_hourly(): } return pd.DataFrame(data) - + @staticmethod + @pytest.fixture + def sample_dataframe_for_location_features(): + data = { + "timestamp": pd.date_range(end=pd.Timestamp.now(), periods=100) + .tolist(), + "device_id": ["device1"] * 100, + "latitude": np.random.uniform(-90, 90, 100), + "longitude": np.random.uniform(-180, 180, 100), + } + return pd.DataFrame(data) @pytest.fixture(scope="session") def mongo_fixture(): diff --git a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py index 7b56aa2a55..ec7128d9e6 100644 --- a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py +++ b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py @@ -87,3 +87,46 @@ def test_hourly_freq_for_time_and_cyclic_features(self, feat_eng_sample_df_hourl for a in ["year", "month", "day", "dayofweek", "hour", "week"]: for t in ["_sin", "_cos"]: assert f"{a}{t}" in hourly_df.columns + + def test_empty_df_for_location_features(self, sample_dataframe_for_location_features): + with pytest.raises(ValueError, match="Empty dataframe provided"): + FUtils.get_location_features(pd.DataFrame()) + + + def test_missing_timestamp_for_location_features( + self, + sample_dataframe_for_location_features, + ): + del sample_dataframe_for_location_features[ + "timestamp" + ] + with pytest.raises(ValueError, match="timestamp column is missing"): + FUtils.get_location_features(sample_dataframe_for_location_features) + + + # For missing 'latitude' column + def 
test_missing_latitude_for_location_features( + self, sample_dataframe_for_location_features + ): + del sample_dataframe_for_location_features[ + "latitude" + ] # Test for missing 'latitude' + with pytest.raises(ValueError, match="latitude column is missing"): + FUtils.get_location_features(sample_dataframe_for_location_features) + + + def test_missing_longitude_for_location_features( + self, sample_dataframe_for_location_features + ): + del sample_dataframe_for_location_features[ + "longitude" + ] # Test for missing 'longitude' + with pytest.raises(ValueError, match="longitude column is missing"): + FUtils.get_location_features(sample_dataframe_for_location_features) + + + # Test the normal procedure + def test_get_location_features(self, sample_dataframe_for_location_features): + df = FUtils.get_location_features(sample_dataframe_for_location_features) + for cord in ["x_cord", "y_cord", "z_cord"]: + assert cord in df.columns \ No newline at end of file diff --git a/src/airflow/dev-requirements.txt b/src/airflow/dev-requirements.txt index 81c23b0562..d37188bf8d 100644 --- a/src/airflow/dev-requirements.txt +++ b/src/airflow/dev-requirements.txt @@ -3,6 +3,7 @@ apache-airflow-providers-slack confluent-avro google-cloud-bigquery google-cloud-storage +optuna pyarrow sentry-sdk pandas @@ -17,5 +18,4 @@ db_dtypes mlflow lightgbm gcsfs -pymongo pytest \ No newline at end of file From 8a6e8e77f42860ee7446a579aa6cedcf7ad98c64 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:07:49 +0300 Subject: [PATCH 21/43] refactor forecast job DAG --- src/airflow/airqo_etl_utils/ml_utils.py | 1 - src/airflow/dags/ml_prediction_jobs.py | 61 ++++++++++++++++++++----- 2 files changed, 50 insertions(+), 12 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 3f541662f7..d026a418c8 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -609,7 +609,6 @@ def get_forecasts( ) forecasts = pd.concat([forecasts, device_forecasts], ignore_index=True) - print(device) forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) # forecasts["margin_of_error"] = forecasts["margin_of_error"].astype(float) diff --git a/src/airflow/dags/ml_prediction_jobs.py b/src/airflow/dags/ml_prediction_jobs.py index 04972fccd6..20cd5593ca 100644 --- a/src/airflow/dags/ml_prediction_jobs.py +++ b/src/airflow/dags/ml_prediction_jobs.py @@ -34,8 +34,18 @@ def preprocess_historical_data_hourly_forecast(data): return ForecastUtils.preprocess_data(data, "hourly") @task - def feature_eng_hourly_historical_data(data): - return ForecastUtils.feature_eng_data(data, "pm2_5", "hourly", "predict") + def generate_lag_and_rolling_features_hourly_forecast(data): + return ForecastUtils.get_lag_and_roll_features(data, "pm2_5", "hourly") + + + @task() + def get_time_and_cyclic_features_hourly_forecast(data): + return ForecastUtils.get_time_and_cyclic_features(data, "hourly") + + + @task() + def get_location_features_hourly_forecast(data): + return ForecastUtils.get_location_features(data) @task() def make_hourly_forecasts(data): @@ -70,12 +80,22 @@ def preprocess_historical_data_daily_forecast(data): return ForecastUtils.preprocess_data(data, "daily") @task() - def feature_engineer_daily_historical_data(data): - return ForecastUtils.feature_eng_data(data, "pm2_5", "daily", "predict") + def generate_lag_and_rolling_features_daily_forecast(data): + return 
ForecastUtils.get_lag_and_roll_features(data, "pm2_5", "daily") + + @task() + def get_time_and_cyclic_features_daily_forecast(data): + return ForecastUtils.get_time_and_cyclic_features(data, "daily") + + @task() + def get_location_features_daily_forecast(data): + return ForecastUtils.get_location_features(data) @task() def make_daily_forecasts(data): - return ForecastUtils.generate_forecasts(data, project_id, bucket, "daily") + return ForecastUtils.generate_forecasts( + data=data, project_name=project_id, bucket_name=bucket, frequency="daily" + ) @task() def save_daily_forecasts_to_bigquery(data): @@ -87,17 +107,36 @@ def save_daily_forecasts_to_bigquery(data): def save_daily_forecasts_to_mongo(data): ForecastUtils.save_forecasts_to_mongo(data, "daily") + + # Hourly forecast pipeline hourly_data = get_historical_data_for_hourly_forecasts() - preprocessed_hourly_data = preprocess_historical_data_hourly_forecast(hourly_data) - feat_data = feature_eng_hourly_historical_data(preprocessed_hourly_data) - hourly_forecasts = make_hourly_forecasts(feat_data) + hourly_preprocessed_data = preprocess_historical_data_hourly_forecast(hourly_data) + hourly_lag_and_roll_features = generate_lag_and_rolling_features_hourly_forecast( + hourly_preprocessed_data + ) + hourly_time_and_cyclic_features = get_time_and_cyclic_features_hourly_forecast( + hourly_lag_and_roll_features + ) + hourly_location_features = get_location_features_hourly_forecast( + hourly_time_and_cyclic_features + ) + hourly_forecasts = make_hourly_forecasts(hourly_location_features) save_hourly_forecasts_to_bigquery(hourly_forecasts) save_hourly_forecasts_to_mongo(hourly_forecasts) + # Daily forecast pipeline daily_data = get_historical_data_for_daily_forecasts() - preprocessed_daily_data = preprocess_historical_data_daily_forecast(daily_data) - feat_data = feature_engineer_daily_historical_data(preprocessed_daily_data) - daily_forecasts = make_daily_forecasts(feat_data) + daily_preprocessed_data = preprocess_historical_data_daily_forecast(daily_data) + daily_lag_and_roll_features = generate_lag_and_rolling_features_daily_forecast( + daily_preprocessed_data + ) + daily_time_and_cyclic_features = get_time_and_cyclic_features_daily_forecast( + daily_lag_and_roll_features + ) + daily_location_features = get_location_features_daily_forecast( + daily_time_and_cyclic_features + ) + daily_forecasts = make_daily_forecasts(daily_location_features) save_daily_forecasts_to_bigquery(daily_forecasts) save_daily_forecasts_to_mongo(daily_forecasts) From 01a6674aabd40da4342d10d096bc0daed0d12465 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:20:18 +0300 Subject: [PATCH 22/43] Update AirQo exceedance production image tag to prod-74273167-1695028772 --- k8s/exceedance/values-prod-airqo.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/exceedance/values-prod-airqo.yaml b/k8s/exceedance/values-prod-airqo.yaml index 835a06a025..8a9e92895e 100644 --- a/k8s/exceedance/values-prod-airqo.yaml +++ b/k8s/exceedance/values-prod-airqo.yaml @@ -4,6 +4,6 @@ app: configmap: env-exceedance-production image: repository: eu.gcr.io/airqo-250220/airqo-exceedance-job - tag: prod-d4165e1e-1695022368 + tag: prod-74273167-1695028772 nameOverride: '' fullnameOverride: '' From 4233c133a606fa3ba2f50c9ef72c7564435ed01a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:20:27 +0300 Subject: 
[PATCH 23/43] Update KCCA exceedance production image tag to prod-74273167-1695028772 --- k8s/exceedance/values-prod-kcca.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/exceedance/values-prod-kcca.yaml b/k8s/exceedance/values-prod-kcca.yaml index d8306e316e..f1b68b74fe 100644 --- a/k8s/exceedance/values-prod-kcca.yaml +++ b/k8s/exceedance/values-prod-kcca.yaml @@ -4,6 +4,6 @@ app: configmap: env-exceedance-production image: repository: eu.gcr.io/airqo-250220/kcca-exceedance-job - tag: prod-d4165e1e-1695022368 + tag: prod-74273167-1695028772 nameOverride: '' fullnameOverride: '' From 2474e9b2cea09c993bd864b9a256ceea55952f5f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:21:04 +0300 Subject: [PATCH 24/43] Update incentives production image tag to prod-74273167-1695028772 --- k8s/incentives/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/incentives/values-prod.yaml b/k8s/incentives/values-prod.yaml index 91b285ee99..2526ab40a1 100644 --- a/k8s/incentives/values-prod.yaml +++ b/k8s/incentives/values-prod.yaml @@ -6,7 +6,7 @@ app: replicaCount: 3 image: repository: eu.gcr.io/airqo-250220/airqo-incentives-api - tag: prod-d4165e1e-1695022368 + tag: prod-74273167-1695028772 nameOverride: '' fullnameOverride: '' podAnnotations: {} From f669ae119dcba6c4e55d67454673cdbd344e4904 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:21:09 +0300 Subject: [PATCH 25/43] Update auth service staging image tag to stage-5513f226-1695028756 --- k8s/auth-service/values-stage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/auth-service/values-stage.yaml b/k8s/auth-service/values-stage.yaml index 8a1b64d1b8..341ee6b68a 100644 --- a/k8s/auth-service/values-stage.yaml +++ b/k8s/auth-service/values-stage.yaml @@ -6,7 +6,7 @@ app: replicaCount: 2 image: repository: eu.gcr.io/airqo-250220/airqo-stage-auth-api - tag: stage-b17fbb54-1694524327 + tag: stage-5513f226-1695028756 nameOverride: '' fullnameOverride: '' podAnnotations: {} From 0e8d9a7edfcb7b0cf93a41709a0a70b1c1e47b92 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:21:57 +0300 Subject: [PATCH 26/43] Update device registry production image tag to prod-74273167-1695028772 --- k8s/device-registry/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/device-registry/values-prod.yaml b/k8s/device-registry/values-prod.yaml index ba45a082f1..11dfc9e0c2 100644 --- a/k8s/device-registry/values-prod.yaml +++ b/k8s/device-registry/values-prod.yaml @@ -6,7 +6,7 @@ app: replicaCount: 3 image: repository: eu.gcr.io/airqo-250220/airqo-device-registry-api - tag: prod-80ea615f-1694585638 + tag: prod-74273167-1695028772 nameOverride: '' fullnameOverride: '' podAnnotations: {} From 6d1b58d4c2f1717d55402308ab8f86810ed27f28 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:22:09 +0300 Subject: [PATCH 27/43] Update auth service production image tag to prod-74273167-1695028772 --- k8s/auth-service/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/auth-service/values-prod.yaml b/k8s/auth-service/values-prod.yaml index 620252ed57..b7b8ea2057 100644 --- a/k8s/auth-service/values-prod.yaml 
+++ b/k8s/auth-service/values-prod.yaml @@ -6,7 +6,7 @@ app: replicaCount: 3 image: repository: eu.gcr.io/airqo-250220/airqo-auth-api - tag: prod-d4165e1e-1695022368 + tag: prod-74273167-1695028772 nameOverride: '' fullnameOverride: '' podAnnotations: {} From 2a75b4e27a64d954eea242041f24f9ad55f77fee Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:22:48 +0300 Subject: [PATCH 28/43] Update analytics production image tag to prod-74273167-1695028772 --- k8s/analytics/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/analytics/values-prod.yaml b/k8s/analytics/values-prod.yaml index 56b7b6b3e5..aa95a02534 100644 --- a/k8s/analytics/values-prod.yaml +++ b/k8s/analytics/values-prod.yaml @@ -8,7 +8,7 @@ images: celeryWorker: eu.gcr.io/airqo-250220/airqo-analytics-celery-worker reportJob: eu.gcr.io/airqo-250220/airqo-analytics-report-job devicesSummaryJob: eu.gcr.io/airqo-250220/airqo-analytics-devices-summary-job - tag: prod-d4165e1e-1695022368 + tag: prod-74273167-1695028772 api: name: airqo-analytics-api label: analytics-api From 280ecfb1464eb3b5d40413f73b1fae5abc9657c5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:23:04 +0300 Subject: [PATCH 29/43] Update airflow prod image tag to prod-74273167-1695028772 --- k8s/airflow/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/airflow/values-prod.yaml b/k8s/airflow/values-prod.yaml index f01e6e714a..1cee1ed482 100644 --- a/k8s/airflow/values-prod.yaml +++ b/k8s/airflow/values-prod.yaml @@ -9,7 +9,7 @@ images: repositories: initContainer: eu.gcr.io/airqo-250220/airqo-apache-airflow-xcom containers: eu.gcr.io/airqo-250220/airqo-apache-airflow - tag: prod-d4165e1e-1695022368 + tag: prod-74273167-1695028772 nameOverride: '' fullnameOverride: '' podAnnotations: {} From 7cc6e2e7f188eeae2a67a96e099d64a04a026ef9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:24:48 +0300 Subject: [PATCH 30/43] Update predict production image tag to prod-74273167-1695028772 --- k8s/predict/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/predict/values-prod.yaml b/k8s/predict/values-prod.yaml index 3b64007dd0..cfe7b4056c 100644 --- a/k8s/predict/values-prod.yaml +++ b/k8s/predict/values-prod.yaml @@ -7,7 +7,7 @@ images: predictJob: eu.gcr.io/airqo-250220/airqo-predict-job trainJob: eu.gcr.io/airqo-250220/airqo-train-job predictPlaces: eu.gcr.io/airqo-250220/airqo-predict-places-air-quality - tag: prod-d4165e1e-1695022368 + tag: prod-74273167-1695028772 api: name: airqo-prediction-api label: prediction-api From 0f22ce91b0b3186539ec3ff031f73452728ce327 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:34:10 +0300 Subject: [PATCH 31/43] Run black formatter --- src/airflow/airqo_etl_utils/bigquery_api.py | 2 - src/airflow/airqo_etl_utils/ml_utils.py | 39 ++++++++++------- .../tests/big_query_api_tests.py | 8 ++-- src/airflow/airqo_etl_utils/tests/conftest.py | 17 ++++---- .../airqo_etl_utils/tests/ml_utils_tests.py | 42 ++++++++++--------- src/airflow/dags/data_warehouse.py | 5 ++- src/airflow/dags/ml_prediction_jobs.py | 7 +--- src/airflow/dags/ml_training_jobs.py | 12 +++--- 8 files changed, 71 insertions(+), 61 deletions(-) diff --git 
a/src/airflow/airqo_etl_utils/bigquery_api.py b/src/airflow/airqo_etl_utils/bigquery_api.py index 65317a0e77..2e66b9fc10 100644 --- a/src/airflow/airqo_etl_utils/bigquery_api.py +++ b/src/airflow/airqo_etl_utils/bigquery_api.py @@ -647,8 +647,6 @@ def fetch_data( except Exception as e: print("Error fetching data from bigquery") - - @staticmethod def save_forecasts_to_bigquery(df, table): """saves the dataframes to the bigquery tables""" diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index d026a418c8..811e267515 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -22,7 +22,8 @@ class GCSUtils: - """ Utility class for saving and retrieving models from GCS""" + """Utility class for saving and retrieving models from GCS""" + # TODO: In future, save and retrieve models from mlflow instead of GCS @staticmethod def get_trained_model_from_gcs(project_name, bucket_name, source_blob_name): @@ -67,7 +68,8 @@ def get_mapping_from_gcs(project_name, bucket_name, source_blob_name): class DecodingUtils: - """ Utility class for encoding and decoding categorical features""" + """Utility class for encoding and decoding categorical features""" + @staticmethod def decode_categorical_features_pred(df, frequency): columns = ["device_id", "site_id", "device_category"] @@ -128,7 +130,13 @@ def encode_categorical_training_features(df, freq): class ForecastUtils: @staticmethod def preprocess_data(data, data_frequency): - required_columns = {"device_id", "site_id", "device_category", "pm2_5", "timestamp"} + required_columns = { + "device_id", + "site_id", + "device_category", + "pm2_5", + "timestamp", + } if not required_columns.issubset(data.columns): missing_columns = required_columns.difference(data.columns) raise ValueError( @@ -158,17 +166,20 @@ def preprocess_data(data, data_frequency): data = data.dropna(subset=["pm2_5"]) return data - @staticmethod def get_lag_and_roll_features(df, target_col, freq): if df.empty: raise ValueError("Empty dataframe provided") - if target_col not in df.columns or "timestamp" not in df.columns or "device_id" not in df.columns: + if ( + target_col not in df.columns + or "timestamp" not in df.columns + or "device_id" not in df.columns + ): raise ValueError("Required columns missing") df["timestamp"] = pd.to_datetime(df["timestamp"]) - + df1 = df.copy() # use copy to prevent terminal warning if freq == "daily": shifts = [1, 2, 3, 7] @@ -229,7 +240,7 @@ def get_time_and_cyclic_features(df, freq): df1[a] = df1["timestamp"].dt.__getattribute__(a) df1[a + "_sin"] = np.sin(2 * np.pi * df1[a] / m) df1[a + "_cos"] = np.cos(2 * np.pi * df1[a] / m) - + df1["week"] = df1["timestamp"].dt.isocalendar().week df1["week_sin"] = np.sin(2 * np.pi * df1["week"] / 52) df1["week_cos"] = np.cos(2 * np.pi * df1["week"] / 52) @@ -240,17 +251,17 @@ def get_time_and_cyclic_features(df, freq): def get_location_features(df): if df.empty: raise ValueError("Empty dataframe provided") - + for column_name in ["timestamp", "latitude", "longitude"]: if column_name not in df.columns: raise ValueError(f"{column_name} column is missing") - + df["timestamp"] = pd.to_datetime(df["timestamp"]) - + df["x_cord"] = np.cos(df["latitude"]) * np.cos(df["longitude"]) df["y_cord"] = np.cos(df["latitude"]) * np.sin(df["longitude"]) df["z_cord"] = np.sin(df["latitude"]) - + return df # df_tmp = get_lag_features(df_tmp, target_column, data_frequency) @@ -280,8 +291,8 @@ def train_and_save_forecast_models(training_data, frequency): 
Perform the actual training for hourly data """ training_data.dropna( - subset=["device_id", "site_id", "device_category"], inplace=True - ) + subset=["device_id", "site_id", "device_category"], inplace=True + ) training_data["device_id"] = training_data["device_id"].astype(int) training_data["site_id"] = training_data["site_id"].astype(int) @@ -583,7 +594,7 @@ def get_forecasts( # + df_tmp.loc[df_tmp.index[-1], "margin_of_error"] # ) - return df_tmp.iloc[-int(horizon):, :] + return df_tmp.iloc[-int(horizon) :, :] forecasts = pd.DataFrame() forecast_model = GCSUtils.get_trained_model_from_gcs( diff --git a/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py b/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py index da4a20e4f4..2be61e9415 100644 --- a/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py +++ b/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py @@ -54,9 +54,7 @@ def fake_query(query, job_config): fake_job = mock.Mock() if "2023-01-01" in query: - fake_job.result.return_value.to_dataframe.return_value = ( - sample_df - ) + fake_job.result.return_value.to_dataframe.return_value = sample_df elif "2023-01-02" in query: fake_job.result.return_value.to_dataframe.return_value = ( fake_data_empty_result @@ -72,6 +70,7 @@ def fake_query(query, job_config): return fake_client + @pytest.mark.parametrize( "start_date_time, expected_df", [ @@ -98,7 +97,6 @@ def fake_query(query, job_config): ], ) def test_fetch_data_correct_se(mock_bigquery_client2, start_date_time, expected_df): - """Tests the fetch_data method for scenarios when correct data is retrieved.""" bq_api = BigQueryApi() @@ -118,6 +116,7 @@ def test_fetch_data_invalid_date(mock_bigquery_client2, start_date_time): with pytest.raises(ValueError): bq_api.fetch_data(start_date_time) + @pytest.mark.parametrize("start_date_time", ["2023-01-03"]) def test_fetch_data_bigquery_error(mock_bigquery_client2, start_date_time): """Tests the fetch_data method for the scenario where a bigquery.GoogleAPIError is raised.""" @@ -130,7 +129,6 @@ def test_fetch_data_bigquery_error(mock_bigquery_client2, start_date_time): bq_api.fetch_data(start_date_time) - def test_fetch_raw_readings_empty(mock_bigquery_client): api = BigQueryApi() api.client = mock_bigquery_client diff --git a/src/airflow/airqo_etl_utils/tests/conftest.py b/src/airflow/airqo_etl_utils/tests/conftest.py index 4f24ccd45e..cdbc784dc7 100644 --- a/src/airflow/airqo_etl_utils/tests/conftest.py +++ b/src/airflow/airqo_etl_utils/tests/conftest.py @@ -11,8 +11,6 @@ def pytest_configure(config): ) - - class ForecastFixtures: @staticmethod @pytest.fixture(scope="session") @@ -28,12 +26,11 @@ def preprocessing_sample_df(): ) return data - @staticmethod @pytest.fixture def feat_eng_sample_df_daily(): data = { - "timestamp": pd.date_range(end = pd.Timestamp.now(), periods=365).tolist(), + "timestamp": pd.date_range(end=pd.Timestamp.now(), periods=365).tolist(), "device_id": ["device1"] * 365, "pm2_5": range(1, 366), } @@ -43,9 +40,11 @@ def feat_eng_sample_df_daily(): @pytest.fixture def feat_eng_sample_df_hourly(): data = { - "timestamp": pd.date_range(end = pd.Timestamp.now(), periods=24*14, freq='H').tolist(), - "device_id": ["device1"] * 24*14, - "pm2_5": range(1, 24*14+1), + "timestamp": pd.date_range( + end=pd.Timestamp.now(), periods=24 * 14, freq="H" + ).tolist(), + "device_id": ["device1"] * 24 * 14, + "pm2_5": range(1, 24 * 14 + 1), } return pd.DataFrame(data) @@ -53,14 +52,14 @@ def feat_eng_sample_df_hourly(): @pytest.fixture def 
sample_dataframe_for_location_features(): data = { - "timestamp": pd.date_range(end=pd.Timestamp.now(), periods=100) - .tolist(), + "timestamp": pd.date_range(end=pd.Timestamp.now(), periods=100).tolist(), "device_id": ["device1"] * 100, "latitude": np.random.uniform(-90, 90, 100), "longitude": np.random.uniform(-180, 180, 100), } return pd.DataFrame(data) + @pytest.fixture(scope="session") def mongo_fixture(): from airqo_etl_utils.mongo_client import MongoClient diff --git a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py index ec7128d9e6..f07f58c908 100644 --- a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py +++ b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py @@ -66,44 +66,50 @@ def test_empty_df_for_time_and_cyclic_features(self): with pytest.raises(ValueError, match="Empty dataframe provided"): FUtils.get_time_and_cyclic_features(pd.DataFrame(), "daily") - def test_missing_columns_for_time_and_cyclic_features(self, feat_eng_sample_df_daily): + def test_missing_columns_for_time_and_cyclic_features( + self, feat_eng_sample_df_daily + ): with pytest.raises(ValueError, match="Required columns missing"): FUtils.get_time_and_cyclic_features(feat_eng_sample_df_daily, "daily") - def test_invalid_frequency_for_time_and_cyclic_features(self, feat_eng_sample_df_daily): + def test_invalid_frequency_for_time_and_cyclic_features( + self, feat_eng_sample_df_daily + ): with pytest.raises(ValueError, match="Invalid frequency"): FUtils.get_time_and_cyclic_features(feat_eng_sample_df_daily, "annually") -# For 'daily' frequency + # For 'daily' frequency def test_daily_freq_for_time_and_cyclic_features(self, feat_eng_sample_df_daily): - daily_df = FUtils.get_time_and_cyclic_features(feat_eng_sample_df_daily, "daily") + daily_df = FUtils.get_time_and_cyclic_features( + feat_eng_sample_df_daily, "daily" + ) for a in ["year", "month", "day", "dayofweek", "week"]: for t in ["_sin", "_cos"]: assert f"{a}{t}" in daily_df.columns -# For 'hourly' frequency + # For 'hourly' frequency def test_hourly_freq_for_time_and_cyclic_features(self, feat_eng_sample_df_hourly): - hourly_df = FUtils.get_time_and_cyclic_features(feat_eng_sample_df_hourly, "hourly") + hourly_df = FUtils.get_time_and_cyclic_features( + feat_eng_sample_df_hourly, "hourly" + ) for a in ["year", "month", "day", "dayofweek", "hour", "week"]: for t in ["_sin", "_cos"]: assert f"{a}{t}" in hourly_df.columns - def test_empty_df_for_location_features(self, sample_dataframe_for_location_features): + def test_empty_df_for_location_features( + self, sample_dataframe_for_location_features + ): with pytest.raises(ValueError, match="Empty dataframe provided"): FUtils.get_location_features(pd.DataFrame()) - - + def test_missing_timestamp_for_location_features( self, sample_dataframe_for_location_features, ): - del sample_dataframe_for_location_features[ - "timestamp" - ] + del sample_dataframe_for_location_features["timestamp"] with pytest.raises(ValueError, match="timestamp column is missing"): FUtils.get_location_features(sample_dataframe_for_location_features) - - + # For missing 'latitude' column def test_missing_latitude_for_location_features( self, sample_dataframe_for_location_features @@ -113,8 +119,7 @@ def test_missing_latitude_for_location_features( ] # Test for missing 'latitude' with pytest.raises(ValueError, match="latitude column is missing"): FUtils.get_location_features(sample_dataframe_for_location_features) - - + def test_missing_longitude_for_location_features( self, 
sample_dataframe_for_location_features ): @@ -123,10 +128,9 @@ def test_missing_longitude_for_location_features( ] # Test for missing 'longitude' with pytest.raises(ValueError, match="longitude column is missing"): FUtils.get_location_features(sample_dataframe_for_location_features) - - + # Test the normal procedure def test_get_location_features(self, sample_dataframe_for_location_features): df = FUtils.get_location_features(sample_dataframe_for_location_features) for cord in ["x_cord", "y_cord", "z_cord"]: - assert cord in df.columns \ No newline at end of file + assert cord in df.columns diff --git a/src/airflow/dags/data_warehouse.py b/src/airflow/dags/data_warehouse.py index af9c20a70b..09316c73e1 100644 --- a/src/airflow/dags/data_warehouse.py +++ b/src/airflow/dags/data_warehouse.py @@ -142,7 +142,6 @@ def load(data: pd.DataFrame): load(clean_consolidated_data) - @dag( "Historical-Consolidated-Data-ETL", schedule=None, @@ -185,7 +184,7 @@ def extract_hourly_weather_data(**kwargs): from airqo_etl_utils.date import DateUtils start_date_time, end_date_time = DateUtils.get_dag_date_time_values( - historical=True, **kwargs + historical=True, **kwargs ) return DataWarehouseUtils.extract_hourly_weather_data( @@ -238,6 +237,7 @@ def load(data: pd.DataFrame): ) load(merged_data) + @dag( "Historical-Cleanup-Consolidated-Data", schedule=None, @@ -280,6 +280,7 @@ def load(data: pd.DataFrame): clean_consolidated_data = remove_duplicates(consolidated_data) load(clean_consolidated_data) + data_warehouse_consolidated_data() data_warehouse_cleanup_consolidated_data() data_warehouse_historical_consolidated_data() diff --git a/src/airflow/dags/ml_prediction_jobs.py b/src/airflow/dags/ml_prediction_jobs.py index 20cd5593ca..9dd0be4ec1 100644 --- a/src/airflow/dags/ml_prediction_jobs.py +++ b/src/airflow/dags/ml_prediction_jobs.py @@ -36,13 +36,11 @@ def preprocess_historical_data_hourly_forecast(data): @task def generate_lag_and_rolling_features_hourly_forecast(data): return ForecastUtils.get_lag_and_roll_features(data, "pm2_5", "hourly") - - + @task() def get_time_and_cyclic_features_hourly_forecast(data): return ForecastUtils.get_time_and_cyclic_features(data, "hourly") - - + @task() def get_location_features_hourly_forecast(data): return ForecastUtils.get_location_features(data) @@ -107,7 +105,6 @@ def save_daily_forecasts_to_bigquery(data): def save_daily_forecasts_to_mongo(data): ForecastUtils.save_forecasts_to_mongo(data, "daily") - # Hourly forecast pipeline hourly_data = get_historical_data_for_hourly_forecasts() hourly_preprocessed_data = preprocess_historical_data_hourly_forecast(hourly_data) diff --git a/src/airflow/dags/ml_training_jobs.py b/src/airflow/dags/ml_training_jobs.py index e5822d12cc..40c563e86e 100644 --- a/src/airflow/dags/ml_training_jobs.py +++ b/src/airflow/dags/ml_training_jobs.py @@ -34,11 +34,11 @@ def preprocess_training_data_for_hourly_forecast_model(data): @task() def get_hourly_lag_and_rolling_features(data): - return ForecastUtils.get_lag_and_roll_features(data, 'pm2_5', 'hourly') + return ForecastUtils.get_lag_and_roll_features(data, "pm2_5", "hourly") @task() def get_hourly_time_and_cyclic_features(data): - return ForecastUtils.get_time_and_cyclic_features(data, 'hourly') + return ForecastUtils.get_time_and_cyclic_features(data, "hourly") @task() def get_location_features(data): @@ -46,7 +46,8 @@ def get_location_features(data): @task() def encode_categorical_features(data): - return DecodingUtils.encode_categorical_training_features(data, 'daily') + return 
DecodingUtils.encode_categorical_training_features(data, "daily") + @task() def train_and_save_hourly_forecast_model(train_data): return ForecastUtils.train_and_save_forecast_models( @@ -76,7 +77,7 @@ def get_daily_lag_and_rolling_features(data): @task() def get_daily_time_and_cylic_features(data): - return ForecastUtils.get_time_and_cyclic_features(data, 'daily') + return ForecastUtils.get_time_and_cyclic_features(data, "daily") @task() def get_location_features(data): @@ -84,7 +85,8 @@ def get_location_features(data): @task() def encode_categorical_features(data): - return DecodingUtils.encode_categorical_training_features(data, 'daily') + return DecodingUtils.encode_categorical_training_features(data, "daily") + @task() def train_and_save_daily_model(train_data): return ForecastUtils.train_and_save_forecast_models(train_data, "daily") From 75be7a5a7385ea5e5dd5f080085209905e3d57a6 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Mon, 18 Sep 2023 13:05:11 +0300 Subject: [PATCH 32/43] Update ml_prediction_jobs.py --- src/airflow/dags/ml_prediction_jobs.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/airflow/dags/ml_prediction_jobs.py b/src/airflow/dags/ml_prediction_jobs.py index 9dd0be4ec1..e602a0e3c1 100644 --- a/src/airflow/dags/ml_prediction_jobs.py +++ b/src/airflow/dags/ml_prediction_jobs.py @@ -3,7 +3,7 @@ from airqo_etl_utils.airflow_custom_utils import AirflowUtils from airqo_etl_utils.bigquery_api import BigQueryApi from airqo_etl_utils.config import configuration -from airqo_etl_utils.ml_utils import ForecastUtils +from airqo_etl_utils.ml_utils import ForecastUtils, DecodingUtils @dag( @@ -45,6 +45,10 @@ def get_time_and_cyclic_features_hourly_forecast(data): def get_location_features_hourly_forecast(data): return ForecastUtils.get_location_features(data) + @task() + def encode_hourly_categorical_features(data): + return DecodingUtils.decode_categorical_features_pred(data, "hourly") + @task() def make_hourly_forecasts(data): return ForecastUtils.generate_forecasts( @@ -90,6 +94,9 @@ def get_location_features_daily_forecast(data): return ForecastUtils.get_location_features(data) @task() + def encode_daily_categorical_features(data): + return DecodingUtils.decode_categorical_features_pred(data, "daily") + @task() def make_daily_forecasts(data): return ForecastUtils.generate_forecasts( data=data, project_name=project_id, bucket_name=bucket, frequency="daily" @@ -117,7 +124,10 @@ def save_daily_forecasts_to_mongo(data): hourly_location_features = get_location_features_hourly_forecast( hourly_time_and_cyclic_features ) - hourly_forecasts = make_hourly_forecasts(hourly_location_features) + hourly_encoded_features = encode_hourly_categorical_features( + hourly_location_features + ) + hourly_forecasts = make_hourly_forecasts(hourly_encoded_features) save_hourly_forecasts_to_bigquery(hourly_forecasts) save_hourly_forecasts_to_mongo(hourly_forecasts) @@ -133,7 +143,10 @@ def save_daily_forecasts_to_mongo(data): daily_location_features = get_location_features_daily_forecast( daily_time_and_cyclic_features ) - daily_forecasts = make_daily_forecasts(daily_location_features) + daily_encoded_features = encode_daily_categorical_features( + daily_location_features + ) + daily_forecasts = make_daily_forecasts(daily_encoded_features) save_daily_forecasts_to_bigquery(daily_forecasts) save_daily_forecasts_to_mongo(daily_forecasts) From ccb9ee22e060b1eadae3003b3bb9bec114ebc701 Mon Sep 17 00:00:00 2001 From: 
"github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 15:14:07 +0300 Subject: [PATCH 33/43] Update airflow staging image tag to stage-defae719-1695039035 --- k8s/airflow/values-stage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/airflow/values-stage.yaml b/k8s/airflow/values-stage.yaml index 20127f0f6d..0f8ec9a229 100644 --- a/k8s/airflow/values-stage.yaml +++ b/k8s/airflow/values-stage.yaml @@ -9,7 +9,7 @@ images: repositories: initContainer: eu.gcr.io/airqo-250220/airqo-stage-apache-airflow-xcom containers: eu.gcr.io/airqo-250220/airqo-stage-apache-airflow - tag: stage-d1aaf3c2-1694766672 + tag: stage-defae719-1695039035 nameOverride: '' fullnameOverride: '' podAnnotations: {} From 761aad8cd9f36675f52af9a8517ba5ae1efce1a8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 15:15:07 +0300 Subject: [PATCH 34/43] Update predict staging image tag to stage-defae719-1695039035 --- k8s/predict/values-stage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/predict/values-stage.yaml b/k8s/predict/values-stage.yaml index 79f9f7908d..0c1220227a 100644 --- a/k8s/predict/values-stage.yaml +++ b/k8s/predict/values-stage.yaml @@ -7,7 +7,7 @@ images: predictJob: eu.gcr.io/airqo-250220/stage-airqo-predict-job trainJob: eu.gcr.io/airqo-250220/stage-airqo-train-job predictPlaces: eu.gcr.io/airqo-250220/stage-airqo-predict-places-air-quality - tag: stage-84518356-1693167908 + tag: stage-defae719-1695039035 api: name: airqo-stage-prediction-api label: prediction-api From e010fe97e50771d0de11b1abf699a39c4e43625b Mon Sep 17 00:00:00 2001 From: Benjamin Ssempala <86492979+BenjaminSsempala@users.noreply.github.com> Date: Tue, 19 Sep 2023 14:28:28 +0300 Subject: [PATCH 35/43] Add Translation for Lessons and Quizzes --- src/device-registry/routes/v2/kya.js | 40 +++ src/device-registry/utils/create-event.js | 2 +- .../utils/create-health-tips.js | 2 +- .../utils/create-know-your-air.js | 15 ++ .../utils/test/ut_create-know-your-air.js | 43 ++++ .../utils/test/ut_translate.js | 229 +++++++++++++++--- src/device-registry/utils/translate.js | 97 +++++++- 7 files changed, 386 insertions(+), 42 deletions(-) diff --git a/src/device-registry/routes/v2/kya.js b/src/device-registry/routes/v2/kya.js index b1158a28eb..5cfff363cc 100644 --- a/src/device-registry/routes/v2/kya.js +++ b/src/device-registry/routes/v2/kya.js @@ -79,6 +79,16 @@ router.get( }), ], ]), + oneOf([ + [ + query("language") + .optional() + .notEmpty() + .withMessage("the language cannot be empty when provided") + .bail() + .trim() + ], + ]), knowYourAirController.listLessons ); @@ -97,6 +107,16 @@ router.get( .withMessage("the tenant value is not among the expected ones"), ], ]), + oneOf([ + [ + query("language") + .optional() + .notEmpty() + .withMessage("the language cannot be empty when provided") + .bail() + .trim() + ], + ]), oneOf([ [ @@ -1183,6 +1203,16 @@ router.get( }), ], ]), + oneOf([ + [ + query("language") + .optional() + .notEmpty() + .withMessage("the language cannot be empty when provided") + .bail() + .trim() + ], + ]), knowYourAirController.listQuizzes ); @@ -1201,6 +1231,16 @@ router.get( .withMessage("the tenant value is not among the expected ones"), ], ]), + oneOf([ + [ + query("language") + .optional() + .notEmpty() + .withMessage("the language cannot be empty when provided") + .bail() + .trim() + ], + ]), oneOf([ [ diff --git 
a/src/device-registry/utils/create-event.js b/src/device-registry/utils/create-event.js index 48d8144797..26b76022e1 100644 --- a/src/device-registry/utils/create-event.js +++ b/src/device-registry/utils/create-event.js @@ -595,7 +595,7 @@ const createEvent = { if (language !== undefined && constants.ENVIRONMENT === "STAGING ENVIRONMENT") { let data = responseFromListEvents.data[0].data; for (const event of data) { - let translatedHealthTips = await translateUtil.translate(event.health_tips, language); + let translatedHealthTips = await translateUtil.translateTips(event.health_tips, language); if (translatedHealthTips.success === true) { event.health_tips = translatedHealthTips.data; } diff --git a/src/device-registry/utils/create-health-tips.js b/src/device-registry/utils/create-health-tips.js index 87a198b98b..fdac17a412 100644 --- a/src/device-registry/utils/create-health-tips.js +++ b/src/device-registry/utils/create-health-tips.js @@ -34,7 +34,7 @@ const createHealthTips = { skip, }); if (language !== undefined) { - translatedHealthTips = await translateUtil.translate(responseFromListHealthTips.data, language); + translatedHealthTips = await translateUtil.translateTips(responseFromListHealthTips.data, language); responseFromListHealthTips = translatedHealthTips; } diff --git a/src/device-registry/utils/create-know-your-air.js b/src/device-registry/utils/create-know-your-air.js index 10a02af088..5d8a821862 100644 --- a/src/device-registry/utils/create-know-your-air.js +++ b/src/device-registry/utils/create-know-your-air.js @@ -13,6 +13,7 @@ const { logObject, logElement, logText } = require("./log"); const generateFilter = require("./generate-filter"); const log4js = require("log4js"); const logger = log4js.getLogger(`${constants.ENVIRONMENT} -- create-kya-util`); +const translateUtil = require("./translate"); const mongoose = require("mongoose").set("debug", true); const ObjectId = mongoose.Types.ObjectId; @@ -44,6 +45,7 @@ const createKnowYourAir = { const { user_id } = request.params; const limit = parseInt(request.query.limit, 0); const skip = parseInt(request.query.skip, 0); + const language = request.query.language; const filter = generateFilter.kyalessons(request); if (filter.success && filter.success === false) { return filter; @@ -57,6 +59,12 @@ const createKnowYourAir = { user_id: user_id, } ); + if (language !== undefined) { + const translatedLessons = await translateUtil.translateLessons(responseFromListLessons.data, language); + if (translatedLessons.success === true) { + return translatedLessons; + } + } logObject("responseFromListLessons", responseFromListLessons); return responseFromListLessons; } catch (error) { @@ -938,6 +946,7 @@ const createKnowYourAir = { const { user_id } = request.params; const limit = parseInt(request.query.limit, 0); const skip = parseInt(request.query.skip, 0); + const language = request.query.language; const filter = generateFilter.kyaquizzes(request); if (filter.success && filter.success === false) { return filter; @@ -949,6 +958,12 @@ const createKnowYourAir = { skip, user_id: user_id, }); + if (language !== undefined) { + const translatedQuizzes = await translateUtil.translateQuizzes(responseFromListQuizzes.data, language); + if (translatedQuizzes.success === true) { + return translatedQuizzes; + } + } logObject("responseFromListQuizzes", responseFromListQuizzes); return responseFromListQuizzes; } catch (error) { diff --git a/src/device-registry/utils/test/ut_create-know-your-air.js 
b/src/device-registry/utils/test/ut_create-know-your-air.js index fcb5499776..2c934b1d44 100644 --- a/src/device-registry/utils/test/ut_create-know-your-air.js +++ b/src/device-registry/utils/test/ut_create-know-your-air.js @@ -42,6 +42,28 @@ describe("createKnowYourAir Utility Functions", () => { listStub.restore(); }); + it("should return a list of translated lessons successfully", async () => { + const request = { + query: { tenant: "your-tenant" }, + params: { user_id: "user-id" }, + query: { limit: 10, skip: 0, language: "fr" }, + }; + + // Stub KnowYourAirLessonModel.list + const listStub = sinon + .stub(KnowYourAirLessonModel("your-tenant"), "list") + .resolves({ success: true, data: [], status: httpStatus.OK }); + + const result = await createKnowYourAir.listLesson(request); + + expect(result.success).to.be.true; + expect(result.data).to.deep.equal([]); + expect(result.status).to.equal(httpStatus.OK); + + // Restore the stub + listStub.restore(); + }); + it("should handle filter failure", async () => { const request = { query: { tenant: "your-tenant" }, @@ -1639,6 +1661,27 @@ describe("createKnowYourAir Utility Functions", () => { KnowYourAirQuizModel("your-tenant").list.restore(); }); + it("should list translated quizzes", async () => { + const request = { + query: { tenant: "your-tenant" }, + params: { user_id: "user-id" }, + query: { limit: 10, skip: 0, language: "fr" }, + }; + + // Stub KnowYourAirQuizModel(tenant).list to return quiz data + const quizListStub = sinon + .stub(KnowYourAirQuizModel("your-tenant"), "list") + .resolves({ success: true /* other response properties */ }); + + const result = await createKnowYourAir.listQuiz(request); + + expect(result.success).to.be.true; + // Your other assertions here + + // Restore the stub + KnowYourAirQuizModel("your-tenant").list.restore(); + }); + it("should handle filter failure", async () => { const request = { query: { tenant: "your-tenant" }, diff --git a/src/device-registry/utils/test/ut_translate.js b/src/device-registry/utils/test/ut_translate.js index 665dff3b4f..3c990a4125 100644 --- a/src/device-registry/utils/test/ut_translate.js +++ b/src/device-registry/utils/test/ut_translate.js @@ -9,45 +9,195 @@ const httpStatus = require("http-status"); const translateUtil = require("@utils/translate"); describe('translateUtil', () => { - it('should translate health tips to the target language', async () => { - const healthTips = [ - { - title: 'Hello', - description: 'World', - }, - { - title: 'Good', - description: 'Morning', - }, - ]; - const targetLanguage = 'fr'; - - const expectedTranslations = [ - { - title: 'Bonjour', - description: 'Monde', - }, - { - title: 'Bien', - description: 'Matin', - }, - ]; - - const result = await translateUtil.translate(healthTips, targetLanguage); - - - expect(result).to.have.property('success', true); - for (let i = 0; i < result.data.length; i++) { - expect(result.data[i].title).to.equal(expectedTranslations[i].title); - expect(result.data[i].description).to.equal(expectedTranslations[i].description); - } - }).timeout(10000); - - it('should handle translation errors gracefully', async () => { - - const healthTips = null; - const targetLanguage = 'fr'; - const result = await translateUtil.translate(healthTips, targetLanguage); + describe("translateTips", () => { + it('should translate health tips to the target language', async () => { + const healthTips = [ + { + title: 'Hello', + description: 'World', + }, + { + title: 'Good', + description: 'Morning', + }, + ]; + const targetLanguage = 
'fr'; + + const expectedTranslations = [ + { + title: 'Bonjour', + description: 'Monde', + }, + { + title: 'Bien', + description: 'Matin', + }, + ]; + + const result = await translateUtil.translateTips(healthTips, targetLanguage); + + + expect(result).to.have.property('success', true); + for (let i = 0; i < result.data.length; i++) { + expect(result.data[i].title).to.equal(expectedTranslations[i].title); + expect(result.data[i].description).to.equal(expectedTranslations[i].description); + } + }).timeout(10000); + + it('should handle translation errors gracefully', async () => { + + const healthTips = null; + const targetLanguage = 'fr'; + const result = await translateUtil.translateTips(healthTips, targetLanguage); + + expect(result).to.have.property('success', false); + expect(result).to.have.property('message', 'Internal Server Error'); + expect(result).to.have.property('status', 500); + expect(result).to.have.property('errors'); + expect(result.errors).to.have.property('message'); + }); + }) + + describe("translateLessons", () => { + it('should translate Kya lessons to the target language', async () => { + const kyaLessons = [ + { + "_id": "testId", + "title": "Actions you can take to reduce air pollution", + "completion_message": "You just finished your first Know Your Air Lesson", + "image": "https://testimage", + "tasks": [ + { + "_id": "testId", + "title": "Use public transport", + "content": "Vehicle exhaust is a major source of air pollution. Less cars on the road results in less emissions.", + "image": "https://testimage", + "task_position": 2 + }, + ] + } + ]; + const targetLanguage = 'fr'; + + const expectedTranslations = [ + { + "_id": "testId", + "title": "Mesures que vous pouvez prendre pour réduire la pollution de l’air", + "completion_message": "Vous venez de terminer votre première leçon Know Your Air.", + "image": "https://testimage", + "tasks": [ + { + "_id": "testId", + "title": "Utilisez les transports en commun", + "content": "Les gaz d’échappement des véhicules constituent une source majeure de pollution atmosphérique. Moins de voitures sur la route entraîne moins d’émissions.", + "image": "https://testimage", + "task_position": 2 + }, + ] + } + ]; + + const result = await translateUtil.translateLessons(kyaLessons, targetLanguage); + + + expect(result).to.have.property('success', true); + for (let i = 0; i < result.data.length; i++) { + expect(result.data[i].title).to.equal(expectedTranslations[i].title); + expect(result.data[i].completion_message).to.equal(expectedTranslations[i].completion_message); + expect(result.data[i].tasks).to.deep.equal(expectedTranslations[i].tasks); + } + }).timeout(10000); + + it('should handle translation errors gracefully', async () => { + + const lessons = null; + const targetLanguage = 'fr'; + const result = await translateUtil.translateLessons(lessons, targetLanguage); + + expect(result).to.have.property('success', false); + expect(result).to.have.property('message', 'Internal Server Error'); + expect(result).to.have.property('status', 500); + expect(result).to.have.property('errors'); + expect(result.errors).to.have.property('message'); + }); + }); + describe("translateQuizzes", () => { + it('should translate Kya Quizzes to the target language', async () => { + const kyaQuizzes = [ + { + "_id": "testId", + "title": "Get personalised air quality recommendations", + "description": "Tell us more about Air Quality conditions in your environment & get personalised tips.", + "completion_message": "Way to go🎊. 
You have unlocked personalised air quality recommendations to empower you on your clean air journey.", + "image": "https//testImage", + "questions": [ + { + "title": "Where is your home environment situated?", + "context": "Home environment", + "question_position": 1, + "answers": [ + { + "content": [ + "Cooking with firewood can emit significant amounts of air pollutants.", + "Cook in a well-ventilated kitchen with good airflow or set up an outdoor kitchen if possible.", + "Use an efficient stove designed to burn firewood more cleanly and with less smoke.", + "Consider switching to improved cookstoves that reduce emissions and increase fuel efficiency." + ], + "title": "Firewood", + } + ] + }, + ], + }, + ]; + + const targetLanguage = 'fr'; + + const expectedTranslations = [ + { + "_id": "testId", + "title": "Obtenez des recommandations personnalisées sur la qualité de l'air", + "description": "Dites-nous en plus sur les conditions de qualité de l'air dans votre environnement et obtenez des conseils personnalisés.", + "completion_message": "Bravo🎊. Vous avez débloqué des recommandations personnalisées sur la qualité de l'air pour vous aider dans votre voyage vers un air pur.", + "image": "https//testImage", + "questions": [ + { + "title": "Où se situe votre environnement domestique ?", + "context": "Environnement de la maison", + "question_position": 1, + "answers": [ + { + "content": [ + "Cuisiner avec du bois de chauffage peut émettre des quantités importantes de polluants atmosphériques.", + "Cuisinez dans une cuisine bien ventilée avec une bonne circulation d’air ou installez une cuisine extérieure si possible.", + "Utilisez un poêle efficace conçu pour brûler du bois de chauffage plus proprement et avec moins de fumée.", + "Envisagez de passer à des cuisinières améliorées qui réduisent les émissions et augmentent le rendement énergétique." 
+ ], + "title": "Bois de chauffage", + } + ] + }, + ], + }, + ]; + + const result = await translateUtil.translateQuizzes(kyaQuizzes, targetLanguage); + + + expect(result).to.have.property('success', true); + for (let i = 0; i < result.data.length; i++) { + expect(result.data[i].title).to.equal(expectedTranslations[i].title); + expect(result.data[i].completion_message).to.equal(expectedTranslations[i].completion_message); + expect(result.data[i].questions).to.deep.equal(expectedTranslations[i].questions); + expect(result.data[i].questions.answers).to.deep.equal(expectedTranslations[i].questions.answers); + } + }).timeout(10000); + + it('should handle translation errors gracefully', async () => { + + const kyaQuizzes = null; + const targetLanguage = 'fr'; + const result = await translateUtil.translateQuizzes(kyaQuizzes, targetLanguage); expect(result).to.have.property('success', false); expect(result).to.have.property('message', 'Internal Server Error'); @@ -55,4 +205,5 @@ describe('translateUtil', () => { expect(result).to.have.property('errors'); expect(result.errors).to.have.property('message'); }); +}); }); \ No newline at end of file diff --git a/src/device-registry/utils/translate.js b/src/device-registry/utils/translate.js index 9f885e78c5..eba61b275e 100644 --- a/src/device-registry/utils/translate.js +++ b/src/device-registry/utils/translate.js @@ -9,7 +9,7 @@ const { Translate } = require('@google-cloud/translate').v2; const translate = new Translate(); const translateUtil = { - translate: async (healthTips, targetLanguage) => { + translateTips: async (healthTips, targetLanguage) => { try { const translatedHealthTips = []; @@ -39,6 +39,101 @@ const translateUtil = { }; } }, + + translateLessons: async (lessons, targetLanguage) => { + try { + const translatedLessons = []; + + for (const lesson of lessons) { + const translatedLesson = { ...lesson }; + translatedLesson.title = await translateText(lesson.title, targetLanguage); + translatedLesson.completion_message = await translateText(lesson.completion_message, targetLanguage); + const translatedTasks = []; + for (const task of lesson.tasks) { + const translatedTask = { ...task }; + translatedTask.title = await translateText(task.title, targetLanguage); + translatedTask.content = await translateText(task.content, targetLanguage); + translatedTasks.push(translatedTask); + } + translatedLesson.tasks = translatedTasks + translatedLessons.push(translatedLesson); + } + + return { + success: true, + message: "Translated KYA returned Successfully", + data: translatedLessons, + status: httpStatus.OK, + }; + } catch (error) { + logger.error(`internal server error -- ${error.message}`); + console.log(`internal server error -- ${error.message}`); + + return { + success: false, + message: "Internal Server Error", + status: httpStatus.INTERNAL_SERVER_ERROR, + errors: { + message: error.message, + }, + }; + } + }, + + translateQuizzes: async (quizzes, targetLanguage) => { + try { + const translatedQuizzes = []; + + for (const quiz of quizzes) { + const translatedQuiz = { ...quiz }; + translatedQuiz.title = await translateText(quiz.title, targetLanguage); + translatedQuiz.description = await translateText(quiz.description, targetLanguage); + translatedQuiz.completion_message = await translateText(quiz.completion_message, targetLanguage); + const translatedQuestions = []; + for (const question of quiz.questions) { + const translatedQuestion = { ...question }; + translatedQuestion.title = await translateText(question.title, targetLanguage); + 
translatedQuestion.context = await translateText(question.context, targetLanguage); + const translatedAnswers = []; + for (const answer of question.answers) { + const translatedAnswer = { ...answer }; + translatedAnswer.title = await translateText(answer.title, targetLanguage); + const translatedContent = []; + for (const contentItem of answer.content) { + const translatedItem = await translateText(contentItem, targetLanguage); + translatedContent.push(translatedItem); + } + translatedAnswer.content = translatedContent; + + translatedAnswers.push(translatedAnswer); + } + translatedQuestion.answers = translatedAnswers; + translatedQuestions.push(translatedQuestion); + } + translatedQuiz.questions = translatedQuestions + translatedQuizzes.push(translatedQuiz); + } + + return { + success: true, + message: "Translated KYA returned Successfully", + data: translatedQuizzes, + status: httpStatus.OK, + }; + } catch (error) { + logger.error(`internal server error -- ${error.message}`); + return { + success: false, + message: "Internal Server Error", + status: httpStatus.INTERNAL_SERVER_ERROR, + errors: { + message: error.message, + }, + }; + } + }, + + }; async function translateText(text, target) { From 69e96c86e7dd6f200044304054674d42d2090ce7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 19 Sep 2023 16:20:45 +0300 Subject: [PATCH 36/43] Update device registry staging image tag to stage-33cbc445-1695129549 --- k8s/device-registry/values-stage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/device-registry/values-stage.yaml b/k8s/device-registry/values-stage.yaml index a6e50f65cd..3af90282ae 100644 --- a/k8s/device-registry/values-stage.yaml +++ b/k8s/device-registry/values-stage.yaml @@ -6,7 +6,7 @@ app: replicaCount: 2 image: repository: eu.gcr.io/airqo-250220/airqo-stage-device-registry-api - tag: stage-5e65174e-1695028544 + tag: stage-33cbc445-1695129549 nameOverride: '' fullnameOverride: '' podAnnotations: {} From fba683db9e069b9d513e6fb743d3d0bd99366e53 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Wed, 20 Sep 2023 03:43:51 +0300 Subject: [PATCH 37/43] Fix save_to_mongo method --- src/airflow/airqo_etl_utils/config.py | 4 ++ src/airflow/airqo_etl_utils/ml_utils.py | 49 ++++++++++++------- src/airflow/airqo_etl_utils/tests/conftest.py | 43 ++++++++++++++-- .../airqo_etl_utils/tests/ml_utils_tests.py | 23 +++++++++ src/airflow/dags/ml_prediction_jobs.py | 5 +- 5 files changed, 98 insertions(+), 26 deletions(-) diff --git a/src/airflow/airqo_etl_utils/config.py b/src/airflow/airqo_etl_utils/config.py index 9176da5f2e..3f2768bf8a 100644 --- a/src/airflow/airqo_etl_utils/config.py +++ b/src/airflow/airqo_etl_utils/config.py @@ -1,6 +1,7 @@ import os from pathlib import Path +import pymongo as pm import urllib3 from dotenv import load_dotenv @@ -174,3 +175,6 @@ class Config: configuration = Config() + +client = pm.MongoClient(configuration.MONGO_URI) +db = client[configuration.MONGO_DATABASE_NAME] diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 811e267515..53bf7af901 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -8,11 +8,10 @@ import numpy as np import optuna import pandas as pd -import pymongo as pm from lightgbm import LGBMRegressor, early_stopping from sklearn.metrics import mean_squared_error -from .config import configuration +from 
.config import configuration, db project_id = configuration.GOOGLE_CLOUD_PROJECT_ID bucket = configuration.FORECAST_MODELS_BUCKET @@ -493,6 +492,7 @@ def objective(trial): @staticmethod def generate_forecasts(data, project_name, bucket_name, frequency): + data = data.dropna(subset=["device_id"]) data["timestamp"] = pd.to_datetime(data["timestamp"]) data.columns = data.columns.str.strip() # data["margin_of_error"] = data["adjusted_forecast"] = 0 @@ -641,25 +641,36 @@ def get_forecasts( def save_forecasts_to_mongo(data, frequency): device_ids = data["device_id"].unique() created_at = pd.to_datetime(datetime.now()).isoformat() - forecast_results = [ - { - field: data[data["device_id"] == i][field].tolist()[0] - if field not in ["pm2_5", "timestamp"] - else data[data["device_id"] == i][field].tolist() - for field in data.columns + + forecast_results = [] + for i in device_ids: + doc = { + "device_id": i, + "created_at": created_at, + "pm2_5": data[data["device_id"] == i]["pm2_5"].tolist(), + "timestamp": data[data["device_id"] == i]["timestamp"].tolist(), } - | {"created_at": created_at} - for i in device_ids - ] - client = pm.MongoClient(configuration.MONGO_URI) - db = client[configuration.MONGO_DATABASE_NAME] + forecast_results.append(doc) + if frequency == "hourly": - db.hourly_forecasts.delete_many({}) - db.hourly_forecasts.insert_many(forecast_results) - print(db.hourly_forecasts.find_one()) # confirm saving has worked + collection = db.hourly_forecasts elif frequency == "daily": - db.daily_forecasts.delete_many({}) - db.daily_forecasts.insert_many(forecast_results) - print(db.daily_forecasts_1.find_one()) + collection = db.daily_forecasts else: raise ValueError("Invalid frequency argument") + + for doc in forecast_results: + try: + filter_query = {"device_id": doc["device_id"]} + update_query = { + "$set": { + "pm2_5": doc["pm2_5"], + "timestamp": doc["timestamp"], + "created_at": doc["created_at"], + } + } + collection.update_one(filter_query, update_query, upsert=True) + except Exception as e: + print( + f"Failed to update forecast for device {doc['device_id']}: {str(e)}" + ) diff --git a/src/airflow/airqo_etl_utils/tests/conftest.py b/src/airflow/airqo_etl_utils/tests/conftest.py index cdbc784dc7..cfb5b15bb5 100644 --- a/src/airflow/airqo_etl_utils/tests/conftest.py +++ b/src/airflow/airqo_etl_utils/tests/conftest.py @@ -1,9 +1,12 @@ from datetime import datetime +from unittest.mock import MagicMock import numpy as np import pandas as pd import pytest +from airqo_etl_utils.config import configuration + def pytest_configure(config): config.addinivalue_line( @@ -59,12 +62,44 @@ def sample_dataframe_for_location_features(): } return pd.DataFrame(data) + @staticmethod + @pytest.fixture + def sample_hourly_forecast_data(): + return pd.DataFrame( + { + "device_id": ["dev1", "dev1", "dev2"], + "pm2_5": [10, 15, 20], + "timestamp": [ + datetime(2023, 1, 1, 0), + datetime(2023, 1, 1, 1), + datetime(2023, 1, 1, 2), + ], + } + ) -@pytest.fixture(scope="session") -def mongo_fixture(): - from airqo_etl_utils.mongo_client import MongoClient + @staticmethod + @pytest.fixture + def sample_daily_forecast_data(): + return pd.DataFrame( + { + "device_id": ["dev1", "dev1", "dev2"], + "pm2_5": [10, 15, 20], + "timestamp": [ + datetime(2023, 1, 1), + datetime(2023, 1, 2), + datetime(2023, 1, 3), + ], + } + ) - return MongoClient(uri="mongodb://localhost:27017", db_name="test_db") + @staticmethod + @pytest.fixture + def mock_db(): + mock_client = MagicMock() + mock_db = 
mock_client[configuration.MONGO_DATABASE_NAME] + mock_db.hourly_forecasts = MagicMock() + mock_db.daily_forecasts = MagicMock() + return mock_db class FaultDetectionFixtures: diff --git a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py index f07f58c908..d03b02d7bc 100644 --- a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py +++ b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py @@ -134,3 +134,26 @@ def test_get_location_features(self, sample_dataframe_for_location_features): df = FUtils.get_location_features(sample_dataframe_for_location_features) for cord in ["x_cord", "y_cord", "z_cord"]: assert cord in df.columns + + @pytest.mark.xfail + @pytest.mark.parametrize( + "frequency,collection_name", + [ + ("hourly", "hourly_forecasts"), + ("daily", "daily_forecasts"), + # ("invalid", None), + ], + ) + def test_save_forecasts_to_mongo_frequency( + self, mock_db, frequency, collection_name, sample_dataframe_db + ): + if frequency == "invalid": + # Expect a ValueError for an invalid frequency + with pytest.raises(ValueError) as e: + FUtils.save_forecasts_to_mongo(sample_dataframe_db, frequency) + assert str(e.value) == f"Invalid frequency argument: {frequency}" + else: + # Expect no exception for a valid frequency + FUtils.save_forecasts_to_mongo(sample_dataframe_db, frequency) + mock_collection = getattr(mock_db, collection_name) + assert mock_collection.update_one.call_count == 0 diff --git a/src/airflow/dags/ml_prediction_jobs.py b/src/airflow/dags/ml_prediction_jobs.py index e602a0e3c1..f68a8d4c8d 100644 --- a/src/airflow/dags/ml_prediction_jobs.py +++ b/src/airflow/dags/ml_prediction_jobs.py @@ -96,6 +96,7 @@ def get_location_features_daily_forecast(data): @task() def encode_daily_categorical_features(data): return DecodingUtils.decode_categorical_features_pred(data, "daily") + @task() def make_daily_forecasts(data): return ForecastUtils.generate_forecasts( @@ -143,9 +144,7 @@ def save_daily_forecasts_to_mongo(data): daily_location_features = get_location_features_daily_forecast( daily_time_and_cyclic_features ) - daily_encoded_features = encode_daily_categorical_features( - daily_location_features - ) + daily_encoded_features = encode_daily_categorical_features(daily_location_features) daily_forecasts = make_daily_forecasts(daily_encoded_features) save_daily_forecasts_to_bigquery(daily_forecasts) save_daily_forecasts_to_mongo(daily_forecasts) From 4205c1742cfdfc1f6957a87485901a3b273ac675 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 20 Sep 2023 09:07:15 +0300 Subject: [PATCH 38/43] Update airflow staging image tag to stage-586026aa-1695189812 --- k8s/airflow/values-stage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/airflow/values-stage.yaml b/k8s/airflow/values-stage.yaml index 0f8ec9a229..2cfb0ff91e 100644 --- a/k8s/airflow/values-stage.yaml +++ b/k8s/airflow/values-stage.yaml @@ -9,7 +9,7 @@ images: repositories: initContainer: eu.gcr.io/airqo-250220/airqo-stage-apache-airflow-xcom containers: eu.gcr.io/airqo-250220/airqo-stage-apache-airflow - tag: stage-defae719-1695039035 + tag: stage-586026aa-1695189812 nameOverride: '' fullnameOverride: '' podAnnotations: {} From 027c8570c3d973df2de80919752a0631a0a15da5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 20 Sep 2023 12:08:03 +0300 Subject: [PATCH 39/43] Update airflow staging image tag to 
stage-586026aa-1695189812 From 69be96b461c9c340c9a7468f65727f19a0e885b2 Mon Sep 17 00:00:00 2001 From: Benjamin Ssempala <86492979+BenjaminSsempala@users.noreply.github.com> Date: Thu, 21 Sep 2023 04:35:03 +0300 Subject: [PATCH 40/43] Add tests for data-mgt --- src/data-mgt/node/utils/log.js | 2 +- src/data-mgt/node/utils/test/ut_date.js | 97 ++++++++++++++++++++++- src/data-mgt/node/utils/test/ut_errors.js | 84 +++++++++++++++++++- src/data-mgt/node/utils/test/ut_log.js | 68 +++++++++++++++- 4 files changed, 246 insertions(+), 5 deletions(-) diff --git a/src/data-mgt/node/utils/log.js b/src/data-mgt/node/utils/log.js index 8814967bcd..9a78b39f6e 100644 --- a/src/data-mgt/node/utils/log.js +++ b/src/data-mgt/node/utils/log.js @@ -29,7 +29,7 @@ const logError = (error) => { // console.error(e); if (process.env.NODE_ENV !== "production") { console.log("an unhandled promise rejection" + ": "); - console.error(e); + console.error(error); } return "log deactivated in prod and stage"; }; diff --git a/src/data-mgt/node/utils/test/ut_date.js b/src/data-mgt/node/utils/test/ut_date.js index e75c7e3996..42b457b961 100644 --- a/src/data-mgt/node/utils/test/ut_date.js +++ b/src/data-mgt/node/utils/test/ut_date.js @@ -1 +1,96 @@ -require("module-alias/register"); +const { expect } = require('chai'); +const DateUtil = require('../date'); + +describe('Date Util', () => { + describe('generateDateFormat', () => { + it('should return a formatted date string with hours', async () => { + const ISODate = '2023-09-21T12:34:56Z'; + const result = await DateUtil.generateDateFormat(ISODate); + expect(result).to.equal('2023-09-21-12'); + }); + }); + + describe('isTimeEmpty', () => { + it('should return false for a valid time', () => { + const dateTime = '2023-09-21T12:34:56Z'; + const result = DateUtil.isTimeEmpty(dateTime); + expect(result).to.be.false; + }); + + it('should return true for an empty time', () => { + const dateTime = '2023-09-21'; + const result = DateUtil.isTimeEmpty(dateTime); + expect(result).to.be.true; + }); + }); + + describe('generateDateFormatWithoutHrs', () => { + it('should return a formatted date string without hours', () => { + const ISODate = '2023-09-21T12:34:56Z'; + const result = DateUtil.generateDateFormatWithoutHrs(ISODate); + expect(result).to.equal('2023-09-21'); + }); + }); + + describe('isDate', () => { + it('should return true for date strings with "-" or "/"', () => { + expect(DateUtil.isDate('2023-09-21')).to.be.true; + expect(DateUtil.isDate('09/21/2023')).to.be.true; + }); + + it('should return false for non-date strings', () => { + expect(DateUtil.isDate('2023')).to.be.false; + expect(DateUtil.isDate('Hello, World!')).to.be.false; + }); + }); + + describe('addMonthsToProvideDateTime', () => { + it('should add months to a provided date/time', () => { + const dateTime = '2023-09-21T12:34:56Z'; + const number = 3; + const result = DateUtil.addMonthsToProvideDateTime(dateTime, number); + expect(result).to.be.a('Date'); + }); + + it('should handle empty time and add months to date', () => { + const date = '2023-09-21'; + const number = 3; + const result = DateUtil.addMonthsToProvideDateTime(date, number); + expect(result).to.be.a('Date'); + }); + }); + + describe('monthsInfront', () => { + it('should return a date in the future with the given number of months', () => { + const number = 3; + const result = DateUtil.monthsInfront(number); + expect(result).to.be.a('Date'); + }); + }); + + describe('addDays', () => { + it('should add days to the current date', () => { + const 
number = 7; + const result = DateUtil.addDays(number); + expect(result).to.be.a('Date'); + }); + }); + + describe('getDifferenceInMonths', () => { + it('should calculate the difference in months between two dates', () => { + const date1 = '2023-09-21'; + const date2 = '2024-01-15'; + const result = DateUtil.getDifferenceInMonths(date1, date2); + expect(result).to.equal(4); + }); + }); + + describe('threeMonthsFromNow', () => { + it('should return a date three months from the provided date', () => { + const date = '2023-09-21'; + const result = DateUtil.threeMonthsFromNow(date); + expect(result).to.be.a('Date'); + }); + }); + +}); diff --git a/src/data-mgt/node/utils/test/ut_errors.js b/src/data-mgt/node/utils/test/ut_errors.js index e75c7e3996..6c13a790d5 100644 --- a/src/data-mgt/node/utils/test/ut_errors.js +++ b/src/data-mgt/node/utils/test/ut_errors.js @@ -1 +1,83 @@ -require("module-alias/register"); +const chai = require("chai"); +const { expect } = chai; +const sinon = require("sinon"); +const HTTPStatus = require("http-status"); +const errors = require("../errors"); + +describe("Errors Utility Functions", () => { + describe("convertErrorArrayToObject", () => { + it("should convert an array of errors to an object", () => { + const errorArray = [ + { param: "field1", msg: "Field 1 is required" }, + { param: "field2", msg: "Field 2 must be a number" }, + ]; + + const result = errors.convertErrorArrayToObject(errorArray); + + expect(result).to.deep.equal({ + field1: "Field 1 is required", + field2: "Field 2 must be a number", + }); + }); + }); + + describe("errorResponse", () => { + it("should send an error response with default status code", () => { + const res = { + status: sinon.stub().returnsThis(), + json: sinon.spy(), + }; + + errors.errorResponse({ res, message: "An error occurred" }); + + expect(res.status.calledWith(HTTPStatus.INTERNAL_SERVER_ERROR)).to.be.true; + expect(res.json.calledWithMatch({ + success: false, + message: "An error occurred", + error: { + statusCode: HTTPStatus.INTERNAL_SERVER_ERROR, + message: "An error occurred", + error: {}, + }, + })).to.be.true; + }); + + it("should send an error response with a custom status code", () => { + const res = { + status: sinon.stub().returnsThis(), + json: sinon.spy(), + }; + + errors.errorResponse({ res, message: "Bad request", statusCode: HTTPStatus.BAD_REQUEST }); + + expect(res.status.calledWith(HTTPStatus.BAD_REQUEST)).to.be.true; + expect(res.json.calledWithMatch({ + success: false, + message: "Bad request", + error: { + statusCode: HTTPStatus.BAD_REQUEST, + message: "Bad request", + error: {}, + }, + })).to.be.true; + }); + }); + + describe("badRequest", () => { + it("should send a bad request response", () => { + const res = { + status: sinon.stub().returnsThis(), + json: sinon.spy(), + }; + + errors.badRequest(res, "Bad request", { field: "Invalid input" }); + + expect(res.status.calledWith(HTTPStatus.BAD_REQUEST)).to.be.true; + expect(res.json.calledWithMatch({ + success: false, + message: "Bad request", + errors: { field: "Invalid input" }, + })).to.be.true; + }); + }); +}); diff --git a/src/data-mgt/node/utils/test/ut_log.js b/src/data-mgt/node/utils/test/ut_log.js index 74b1a1f804..e21e7d11a8 100644 --- a/src/data-mgt/node/utils/test/ut_log.js +++ b/src/data-mgt/node/utils/test/ut_log.js @@ -1,2 +1,66 @@ -require("module-alias/register"); -s; +const chai = require("chai"); +const { expect } = chai; +const sinon = require("sinon"); +const { logText, logElement, logObject, logError } = require("../log"); + 
+describe("Logging Utility Functions", () => { + describe("logText", () => { + it("should log a message when not in production", () => { + const consoleLogStub = sinon.stub(console, "log"); + process.env.NODE_ENV = "development"; + const result = logText("Test log message"); + expect(consoleLogStub.calledOnce).to.be.true; + expect(consoleLogStub.calledWith("Test log message")).to.be.true; + consoleLogStub.restore(); + process.env.NODE_ENV = "test"; + }); + + it("should return a log deactivation message in production", () => { + const consoleLogStub = sinon.stub(console, "log"); + process.env.NODE_ENV = "production"; + const result = logText("Test log message"); + expect(consoleLogStub.notCalled).to.be.true; + expect(result).to.equal("log deactivated in prod and stage"); + consoleLogStub.restore(); + process.env.NODE_ENV = "test"; + }); + }); + + describe("logElement", () => { + it("should log an element when not in production", () => { + const consoleLogStub = sinon.stub(console, "log"); + process.env.NODE_ENV = "development"; + const result = logElement("Test", "Element"); + expect(consoleLogStub.calledOnce).to.be.true; + expect(consoleLogStub.calledWith("Test: Element")).to.be.true; + consoleLogStub.restore(); + process.env.NODE_ENV = "test"; + }); + }); + + describe("logObject", () => { + it("should log an object when not in production", () => { + const consoleLogStub = sinon.stub(console, "log"); + process.env.NODE_ENV = "development"; + const result = logObject("Test", { key: "value" }); + expect(consoleLogStub.calledOnce).to.be.true; + expect(consoleLogStub.calledWith("Test: ")).to.be.true; + consoleLogStub.restore(); + process.env.NODE_ENV = "test"; + }); + }); + + describe("logError", () => { + it("should log an error when not in production", () => { + const consoleErrorStub = sinon.stub(console, "error"); + process.env.NODE_ENV = "development"; + const error = new Error("Test error message"); + const result = logError(error); + expect(consoleErrorStub.calledOnce).to.be.true; + expect(consoleErrorStub.calledWith(error)).to.be.true; + consoleErrorStub.restore(); + process.env.NODE_ENV = "test"; + }); + + }); +}); From 676e80f0ed7c15dab29fa0100e80e5dd06fe2303 Mon Sep 17 00:00:00 2001 From: Benjamin Ssempala <86492979+BenjaminSsempala@users.noreply.github.com> Date: Thu, 21 Sep 2023 04:44:49 +0300 Subject: [PATCH 41/43] Fixing errors for incentives tests --- src/incentives/utils/test/ut_create-transaction.js | 14 +++++++------- src/incentives/utils/test/ut_generate-filter.js | 1 + 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/incentives/utils/test/ut_create-transaction.js b/src/incentives/utils/test/ut_create-transaction.js index ee9c90f403..25bd1e4a6b 100644 --- a/src/incentives/utils/test/ut_create-transaction.js +++ b/src/incentives/utils/test/ut_create-transaction.js @@ -4,7 +4,7 @@ const chai = require("chai"); const { expect } = chai; const httpStatus = require("http-status"); -const TransactionModel = require("@models/Transaction"); +const TransactionModel = require("@models/transaction"); const createTransaction = require("@utils/create-transaction"); const axios = require("axios"); @@ -509,7 +509,7 @@ describe("createTransaction", () => { }); // Execute the function - const response = await getTransactionDetails(request); + const response = await createTransaction.getTransactionDetails(request); // Assert the response expect(response).to.deep.equal(expectedResponse); @@ -537,7 +537,7 @@ describe("createTransaction", () => { .rejects(new Error("Network 
Error")); // Execute the function - const response = await getTransactionDetails(request); + const response = await createTransaction.getTransactionDetails(request); // Assert the response expect(response).to.deep.equal({ @@ -601,7 +601,7 @@ describe("createTransaction", () => { }); // Execute the function - const response = await loadDataBundle(request); + const response = await createTransaction.loadDataBundle(request); // Assert the response expect(response).to.deep.equal(expectedResponse); @@ -660,7 +660,7 @@ describe("createTransaction", () => { .rejects(new Error("Network Error")); // Execute the function - const response = await loadDataBundle(request); + const response = await createTransaction.loadDataBundle(request); // Assert the response expect(response).to.deep.equal({ @@ -721,7 +721,7 @@ describe("createTransaction", () => { }; // Execute the function - const response = await checkRemainingDataBundleBalance(request); + const response = await createTransaction.checkRemainingDataBundleBalance(request); // Assert the response expect(response).to.deep.equal(expectedResponse); @@ -745,7 +745,7 @@ describe("createTransaction", () => { const throwStub = chai.spy.on(errorStub, "throw"); // Execute the function - const response = await checkRemainingDataBundleBalance(request); + const response = await createTransaction.checkRemainingDataBundleBalance(request); // Assert the response expect(response).to.deep.equal({ diff --git a/src/incentives/utils/test/ut_generate-filter.js b/src/incentives/utils/test/ut_generate-filter.js index 3e0da9e6d5..871f37bb43 100644 --- a/src/incentives/utils/test/ut_generate-filter.js +++ b/src/incentives/utils/test/ut_generate-filter.js @@ -5,6 +5,7 @@ const { expect } = chai; const generateFilter = require("@utils/generate-filter"); const mongoose = require("mongoose"); const ObjectId = mongoose.Types.ObjectId; +const httpStatus = require("http-status"); describe("generateFilter", () => { describe("hosts", () => { From 2ed3a86ba00e7db93cb5c2eb00de25bf913af8c1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 21 Sep 2023 09:56:00 +0300 Subject: [PATCH 42/43] Update data mgt staging image tag to stage-d808bb92-1695279279 --- k8s/data-mgt/values-stage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/data-mgt/values-stage.yaml b/k8s/data-mgt/values-stage.yaml index 7b2e05ff8a..b700dd54ec 100644 --- a/k8s/data-mgt/values-stage.yaml +++ b/k8s/data-mgt/values-stage.yaml @@ -6,7 +6,7 @@ app: replicaCount: 2 image: repository: eu.gcr.io/airqo-250220/airqo-stage-data-mgt-api - tag: stage-e2c1d558-1691937865 + tag: stage-d808bb92-1695279279 nameOverride: '' fullnameOverride: '' podAnnotations: {} From c7fb17a053013a039eafde409e3421278c3adb55 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 21 Sep 2023 09:56:15 +0300 Subject: [PATCH 43/43] Update incentives staging image tag to stage-d808bb92-1695279279 --- k8s/incentives/values-stage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/incentives/values-stage.yaml b/k8s/incentives/values-stage.yaml index 004bc0de37..6f124f67d1 100644 --- a/k8s/incentives/values-stage.yaml +++ b/k8s/incentives/values-stage.yaml @@ -6,7 +6,7 @@ app: replicaCount: 2 image: repository: eu.gcr.io/airqo-250220/airqo-stage-incentives-api - tag: stage-f7ce8287-1693130445 + tag: stage-d808bb92-1695279279 nameOverride: '' fullnameOverride: '' 
podAnnotations: {}
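
For reference, the per-device upsert pattern that patch 37 introduces in save_forecasts_to_mongo (replacing the earlier delete_many()/insert_many() calls) can be exercised in isolation roughly as shown below. This is an illustrative sketch, not part of the patch series: the connection URI, database name, device id, and sample readings are placeholder assumptions; only the pymongo update_one(..., upsert=True) call and the document shape (device_id, created_at, pm2_5, timestamp) come from the diff above.

    # Sketch only: standalone exercise of the upsert logic from patch 37.
    # URI, database name, and the sample document are assumed values.
    from datetime import datetime

    import pymongo as pm

    client = pm.MongoClient("mongodb://localhost:27017")  # placeholder URI
    db = client["airqo_test"]                             # placeholder database
    collection = db.hourly_forecasts

    doc = {
        "device_id": "dev1",  # placeholder device
        "created_at": datetime.now().isoformat(),
        "pm2_5": [10.0, 12.5],
        "timestamp": ["2023-09-20T00:00:00", "2023-09-20T01:00:00"],
    }

    # One upsert per device: update the existing forecast document if the
    # device already has one, otherwise insert a new document.
    collection.update_one(
        {"device_id": doc["device_id"]},
        {"$set": {k: doc[k] for k in ("pm2_5", "timestamp", "created_at")}},
        upsert=True,
    )

Compared with the previous delete-then-insert approach, this keeps one document per device and avoids wiping the whole collection when a single device's forecast fails to save.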