From b95f99dd8c4ec13815befb6c344a0c1bbfef4d20 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Fri, 18 Aug 2023 10:01:19 +0300 Subject: [PATCH 01/43] setup preprocessing methods --- src/airflow/airflow-requirements.txt | 3 +- src/airflow/airqo_etl_utils/bigquery_api.py | 25 +++++++- src/airflow/airqo_etl_utils/ml_utils.py | 67 ++++++++++++++++----- src/airflow/dev-requirements.txt | 3 +- src/airflow/requirements.txt | 3 +- 5 files changed, 81 insertions(+), 20 deletions(-) diff --git a/src/airflow/airflow-requirements.txt b/src/airflow/airflow-requirements.txt index 977410828f..48af8ae3aa 100644 --- a/src/airflow/airflow-requirements.txt +++ b/src/airflow/airflow-requirements.txt @@ -6,4 +6,5 @@ apache-airflow[sentry] lightgbm mlflow gcsfs -pymongo \ No newline at end of file +pymongo +category-encoders \ No newline at end of file diff --git a/src/airflow/airqo_etl_utils/bigquery_api.py b/src/airflow/airqo_etl_utils/bigquery_api.py index 4a2ed54181..879f25db5b 100644 --- a/src/airflow/airqo_etl_utils/bigquery_api.py +++ b/src/airflow/airqo_etl_utils/bigquery_api.py @@ -619,7 +619,7 @@ def fetch_data(self, start_date_time: str, historical: bool = False): # historical is for the actual jobs, not training query = f""" SELECT DISTINCT timestamp as created_at, {"site_id," if historical else ""} device_number, pm2_5_calibrated_value as pm2_5 - FROM `{configuration.BIGQUERY_HOURLY_EVENTS_TABLE_PROD}` + FROM `{self.hourly_measurements_table_prod}` WHERE DATE(timestamp) >= '{start_date_time}' and device_number IS NOT NULL ORDER BY created_at, device_number """ @@ -630,6 +630,29 @@ def fetch_data(self, start_date_time: str, historical: bool = False): df = self.client.query(f"{query}", job_config).result().to_dataframe() return df + def \ + fetch_training_data(self, start_date_time:str,) -> pd.DataFrame: + query = f""" + SELECT DISTINCT + t1.device_id, + t1.timestamp, + t1.site_id, + t1.pm2_5_calibrated_value, + t2.latitude, + t2.longitude, + t3.device_category + FROM `{self.hourly_measurements_table_prod}` t1 + JOIN `{self.sites_table}` t2 on t1.site_id = t2.id + JOIN `{self.devices_table}` t3 on t1.device_id = t3.device_id + WHERE date(t1.timestamp) >= '{start_date_time}' and t1.device_id IS NOT NULL + ORDER BY device_id, timestamp""" + + job_config = bigquery.QueryJobConfig() + job_config.use_query_cache = True + + df = self.client.query(f"{query}", job_config).result().to_dataframe() + return df + @staticmethod def save_forecasts_to_bigquery(df, table): """saves the dataframes to the bigquery tables""" diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 8a7adb8e5a..380e8e58a3 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd import pymongo as pm +from category_encoders import OneHotEncoder, CountEncoder from lightgbm import LGBMRegressor, early_stopping from scipy.stats import skew from sklearn.metrics import mean_squared_error @@ -49,38 +50,36 @@ class ForecastUtils: ###FORECAST MODEL TRAINING UTILS#### @staticmethod def preprocess_training_data(data, frequency): - data["created_at"] = pd.to_datetime(data["created_at"]) - data["device_number"] = data["device_number"].astype(str) - data["pm2_5"] = data.groupby("device_number")["pm2_5"].transform( + data["timestamp"] = pd.to_datetime(data["timestamp"]) + data["pm2_5"] = data.groupby("device_id")["pm2_5"].transform( lambda x: 
x.interpolate(method="linear", limit_direction="both") ) if frequency == "daily": data = ( - data.groupby(["device_number"]) - .resample("D", on="created_at") + data.groupby(["device_id"]) + .resample("D", on="timestamp") .mean(numeric_only=True) ) data.reset_index(inplace=True) - data["pm2_5"] = data.groupby("device_number")["pm2_5"].transform( + data["pm2_5"] = data.groupby("device_id")["pm2_5"].transform( lambda x: x.interpolate(method="linear", limit_direction="both") ) - data["device_number"] = data["device_number"].astype(int) data = data.dropna(subset=["pm2_5"]) return data @staticmethod - def feature_eng_training_data(data, target_column, frequency): + def feature_eng_training_data(data, target_column, frequency): def get_lag_features(df, target_col, freq): - df = df.sort_values(by=["device_number", "created_at"]) + df = df.sort_values(by=["device_id", "timestamp"]) if freq == "daily": - shifts = [1, 2] + shifts = [1, 2, 3, 7, 14] for s in shifts: - df[f"pm2_5_last_{s}_day"] = df.groupby(["device_number"])[ + df[f"pm2_5_last_{s}_day"] = df.groupby(["device_id"])[ target_col ].shift(s) - shifts = [3, 7, 14, 30] + shifts = [2, 3, 7, 14] functions = ["mean", "std", "max", "min"] for s in shifts: for f in functions: @@ -127,10 +126,46 @@ def get_other_features(df_tmp, freq): print("Additional features added") return df_tmp + def encode_categorical_features(df_tmp): + #use count_encoding on site_id & device_id,also save the real values & what they've been encoded to in a dictionary - data["created_at"] = pd.to_datetime(data["created_at"]) + #encode site_id + site_id_encoder = CountEncoder() + site_id_encoder.fit(df_tmp['site_id']) + df_tmp['site_id'] = site_id_encoder.transform(df_tmp['site_id']) + # site_id_encoder_dict = site_id_encoder.mapping[0]['mapping'] + + #encode device_id + device_id_encoder = CountEncoder() + device_id_encoder.fit(df_tmp['device_id']) + df_tmp['device_id'] = device_id_encoder.transform(df_tmp['device_id']) + # device_id_encoder_dict = device_id_encoder.mapping[0]['mapping'] + + device_category_encoder = OneHotEncoder(cols=['device_category']) + df_tmp = device_category_encoder.fit_transform(df_tmp) + + return df_tmp + + def get_time_and_cyclic_features(df): + attributes = ["year", "month", "day", "dayofweek", "hour"] + max_vals = [2023, 12, 31, 6, 23] + for a, m in zip(attributes, max_vals): + df[a] = df["timestamp"].dt.__getattribute__(a) + df[a + "_sin"] = np.sin(2 * np.pi * df[a] / m) + df[a + "_cos"] = np.cos(2 * np.pi * df[a] / m) + + df["week"] = df["timestamp"].dt.isocalendar().week + df["week_sin"] = np.sin(2 * np.pi * df["week"] / 52) + df["week_cos"] = np.cos(2 * np.pi * df["week"] / 52) + df.drop(columns=attributes, inplace=True) + return df + + + data["timestamp"] = pd.to_datetime(data["timestamp"]) df_tmp = get_other_features(data, frequency) df_tmp = get_lag_features(df_tmp, target_column, frequency) + df_tmp = encode_categorical_features(df_tmp) + df_tmp = get_time_and_cyclic_features(df_tmp) return df_tmp @@ -234,9 +269,9 @@ def train_and_save_hourly_forecast_model(train): # separate code for hourly mod @staticmethod def train_and_save_daily_forecast_model(train): # separate code for monthly model - train["created_at"] = pd.to_datetime(train["created_at"]) - train = train.sort_values(by=["device_number", "created_at"]) - features = [c for c in train.columns if c not in ["created_at", "pm2_5"]] + train["timestamp"] = pd.to_datetime(train["timestamp"]) + train = train.sort_values(by=['device_id', 'timestamp']) + features = [c for c in 
train.columns if c not in ["timestamp", "pm2_5"]] print(features) target_col = "pm2_5" train_data, test_data = pd.DataFrame(), pd.DataFrame() diff --git a/src/airflow/dev-requirements.txt b/src/airflow/dev-requirements.txt index 81c23b0562..59d0561bea 100644 --- a/src/airflow/dev-requirements.txt +++ b/src/airflow/dev-requirements.txt @@ -18,4 +18,5 @@ mlflow lightgbm gcsfs pymongo -pytest \ No newline at end of file +pytest +category_encoders \ No newline at end of file diff --git a/src/airflow/requirements.txt b/src/airflow/requirements.txt index c79865c3cf..947c051adc 100644 --- a/src/airflow/requirements.txt +++ b/src/airflow/requirements.txt @@ -18,4 +18,5 @@ gcsfs pymongo~=4.4.1 pytest~=7.4.0 -scipy~=1.11.1 \ No newline at end of file +scipy~=1.11.1 +category_encoders \ No newline at end of file From 9d541f1e15ea548fc8c90c3ba8a2ba990db1682d Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Sun, 20 Aug 2023 23:30:35 +0300 Subject: [PATCH 02/43] training job code setup --- src/airflow/airqo_etl_utils/air_beam_api.py | 58 +-- src/airflow/airqo_etl_utils/airnow_api.py | 8 +- src/airflow/airqo_etl_utils/airqo_api.py | 19 +- src/airflow/airqo_etl_utils/bigquery_api.py | 8 +- src/airflow/airqo_etl_utils/config.py | 1 + src/airflow/airqo_etl_utils/ml_utils.py | 434 ++++++++---------- src/airflow/airqo_etl_utils/plume_labs_api.py | 17 +- src/airflow/airqo_etl_utils/purple_air_api.py | 18 +- src/airflow/airqo_etl_utils/tahmo_api.py | 21 +- 9 files changed, 278 insertions(+), 306 deletions(-) diff --git a/src/airflow/airqo_etl_utils/air_beam_api.py b/src/airflow/airqo_etl_utils/air_beam_api.py index 57adacfcf9..657d69dbf4 100644 --- a/src/airflow/airqo_etl_utils/air_beam_api.py +++ b/src/airflow/airqo_etl_utils/air_beam_api.py @@ -24,25 +24,25 @@ def get_stream_ids( username: str, pollutant: str, ): - params={ - "q": json.dumps( - { - "time_from": int(start_date_time.timestamp()), - "time_to": int(end_date_time.timestamp()), - "tags": "", - "usernames": username, - "west": 10.581214853439886, - "east": 38.08577769782265, - "south": -36.799337832603314, - "north": -19.260169583742446, - "limit": 100, - "offset": 0, - "sensor_name": f"airbeam3-{pollutant}", - "measurement_type": "Particulate Matter", - "unit_symbol": "µg/m³", - } - ) - } + params = { + "q": json.dumps( + { + "time_from": int(start_date_time.timestamp()), + "time_to": int(end_date_time.timestamp()), + "tags": "", + "usernames": username, + "west": 10.581214853439886, + "east": 38.08577769782265, + "south": -36.799337832603314, + "north": -19.260169583742446, + "limit": 100, + "offset": 0, + "sensor_name": f"airbeam3-{pollutant}", + "measurement_type": "Particulate Matter", + "unit_symbol": "µg/m³", + } + ) + } request = self.__request( endpoint=f"mobile/sessions.json", params=params, @@ -65,32 +65,32 @@ def get_measurements( endpoint=f"measurements.json", params=params, ) - - def __request(self, endpoint, params): + def __request(self, endpoint, params): url = f"{self.AIR_BEAM_BASE_URL}{endpoint}" retry_strategy = Retry( total=5, backoff_factor=5, ) - + http = urllib3.PoolManager(retries=retry_strategy) - + try: response = http.request( - "GET", - url, - fields=params,) - + "GET", + url, + fields=params, + ) + response_data = response.data print(response._request_url) - + if response.status == 200: return json.loads(response_data) else: Utils.handle_api_error(response) return None - + except urllib3.exceptions.HTTPError as e: print(f"HTTPError: {e}") return None diff --git 
a/src/airflow/airqo_etl_utils/airnow_api.py b/src/airflow/airqo_etl_utils/airnow_api.py index 851afe89cb..a1e65b8189 100644 --- a/src/airflow/airqo_etl_utils/airnow_api.py +++ b/src/airflow/airqo_etl_utils/airnow_api.py @@ -56,20 +56,20 @@ def __request(self, endpoint, params, api_key): total=5, backoff_factor=5, ) - + http = urllib3.PoolManager(retries=retry_strategy) - + try: response = http.request("GET", url, fields=params) response_data = response.data print(response._request_url) - + if response.status == 200: return json.loads(response_data) else: Utils.handle_api_error(response) return None - + except urllib3.exceptions.HTTPError as e: print(f"HTTPError: {e}") return None diff --git a/src/airflow/airqo_etl_utils/airqo_api.py b/src/airflow/airqo_etl_utils/airqo_api.py index 876f630a9d..e80cd7bf39 100644 --- a/src/airflow/airqo_etl_utils/airqo_api.py +++ b/src/airflow/airqo_etl_utils/airqo_api.py @@ -322,7 +322,7 @@ def __request(self, endpoint, params=None, body=None, method=None, base_url=None params.update({"token": self.AIRQO_API_TOKEN}) retry_strategy = Retry( - total=5, + total=5, backoff_factor=5, ) @@ -338,21 +338,21 @@ def __request(self, endpoint, params=None, body=None, method=None, base_url=None encoded_args = urlencode(params) url = url + "?" + encoded_args response = http.request( - "PUT", + "PUT", url, - headers=headers, - body=simplejson.dumps(body, ignore_nan=True) - ) + headers=headers, + body=simplejson.dumps(body, ignore_nan=True), + ) elif method == "post": headers["Content-Type"] = "application/json" encoded_args = urlencode(params) url = url + "?" + encoded_args response = http.request( - "POST", + "POST", url, - headers=headers, - body=simplejson.dumps(body, ignore_nan=True) - ) + headers=headers, + body=simplejson.dumps(body, ignore_nan=True), + ) else: handle_api_error("Invalid") return None @@ -368,4 +368,3 @@ def __request(self, endpoint, params=None, body=None, method=None, base_url=None except urllib3.exceptions.HTTPError as e: print(f"HTTPError: {e}") return None - diff --git a/src/airflow/airqo_etl_utils/bigquery_api.py b/src/airflow/airqo_etl_utils/bigquery_api.py index 879f25db5b..82212655e2 100644 --- a/src/airflow/airqo_etl_utils/bigquery_api.py +++ b/src/airflow/airqo_etl_utils/bigquery_api.py @@ -630,8 +630,10 @@ def fetch_data(self, start_date_time: str, historical: bool = False): df = self.client.query(f"{query}", job_config).result().to_dataframe() return df - def \ - fetch_training_data(self, start_date_time:str,) -> pd.DataFrame: + def fetch_training_data( + self, + start_date_time: str, + ) -> pd.DataFrame: query = f""" SELECT DISTINCT t1.device_id, @@ -649,7 +651,7 @@ def \ job_config = bigquery.QueryJobConfig() job_config.use_query_cache = True - + df = self.client.query(f"{query}", job_config).result().to_dataframe() return df diff --git a/src/airflow/airqo_etl_utils/config.py b/src/airflow/airqo_etl_utils/config.py index 065efecdb5..9176da5f2e 100644 --- a/src/airflow/airqo_etl_utils/config.py +++ b/src/airflow/airqo_etl_utils/config.py @@ -170,6 +170,7 @@ class Config: FORECAST_MODELS_BUCKET = os.getenv("FORECAST_MODELS_BUCKET") MONGO_URI = os.getenv("MONGO_URI") MONGO_DATABASE_NAME = os.getenv("MONGO_DATABASE_NAME") + ENVIRONMENT = os.getenv("ENVIRONMENT") configuration = Config() diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 380e8e58a3..ef9af89e76 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -1,13 +1,14 @@ +import json from 
datetime import datetime import gcsfs import joblib import mlflow import numpy as np +import optuna import pandas as pd import pymongo as pm -from category_encoders import OneHotEncoder, CountEncoder -from lightgbm import LGBMRegressor, early_stopping +from lightgbm import LGBMRegressor from scipy.stats import skew from sklearn.metrics import mean_squared_error @@ -16,6 +17,7 @@ fixed_columns = ["site_id"] project_id = configuration.GOOGLE_CLOUD_PROJECT_ID bucket = configuration.FORECAST_MODELS_BUCKET +environment = configuration.ENVIRONMENT def get_trained_model_from_gcs(project_name, bucket_name, source_blob_name): @@ -30,8 +32,6 @@ def upload_trained_model_to_gcs( trained_model, project_name, bucket_name, source_blob_name ): fs = gcsfs.GCSFileSystem(project=project_name) - - # backup previous model try: fs.rename( f"{bucket_name}/{source_blob_name}", @@ -46,8 +46,22 @@ def upload_trained_model_to_gcs( job = joblib.dump(trained_model, handle) +def upload_mapping_to_gcs(mapping_dict, project_name, bucket_name, source_blob_name): + fs = gcsfs.GCSFileSystem(project=project_name) + mapping_dict = json.dumps(mapping_dict) + with fs.open(bucket_name + "/" + source_blob_name, "w") as f: + f.write(mapping_dict) + + +def get_mapping_from_gcs(project_name, bucket_name, source_blob_name): + fs = gcsfs.GCSFileSystem(project=project_name) + with fs.open(bucket_name + "/" + source_blob_name, "r") as f: + mapping_dict = json.load(f) + return mapping_dict + + class ForecastUtils: - ###FORECAST MODEL TRAINING UTILS#### + # FORECAST MODEL TRAINING UTILS @staticmethod def preprocess_training_data(data, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) @@ -68,7 +82,7 @@ def preprocess_training_data(data, frequency): return data @staticmethod - def feature_eng_training_data(data, target_column, frequency): + def feature_eng_training_data(data, target_column, frequency): def get_lag_features(df, target_col, freq): df = df.sort_values(by=["device_id", "timestamp"]) @@ -84,28 +98,24 @@ def get_lag_features(df, target_col, freq): for s in shifts: for f in functions: df[f"pm2_5_{f}_{s}_day"] = ( - df.groupby(["device_number"])[target_col] + df.groupby(["device_id"])[target_col] .shift(1) .rolling(s) .agg(f) ) elif freq == "hourly": - shifts = [ - 1, - 2, - ] # TODO: Review to increase these both in training and the actual job + shifts = [1, 2, 6, 12] for s in shifts: - df[f"pm2_5_last_{s}_hour"] = df.groupby(["device_number"])[ + df[f"pm2_5_last_{s}_hour"] = df.groupby(["device_id"])[ target_col ].shift(s) - # lag features - shifts = [6, 12, 24, 48] + shifts = [3, 6, 12, 24] functions = ["mean", "std", "median", "skew"] for s in shifts: for f in functions: df[f"pm2_5_{f}_{s}_hour"] = ( - df.groupby(["device_number"])[target_col] + df.groupby(["device_id"])[target_col] .shift(1) .rolling(s) .agg(f) @@ -115,270 +125,232 @@ def get_lag_features(df, target_col, freq): return df - def get_other_features(df_tmp, freq): - # TODO: Experiment on impact of features - attributes = ["year", "month", "day", "dayofweek"] - if freq == "hourly": - attributes.extend(["hour", "minute"]) - for a in attributes: - df_tmp[a] = df_tmp["created_at"].dt.__getattribute__(a) - df_tmp["week"] = df_tmp["created_at"].dt.isocalendar().week.astype(int) - - print("Additional features added") - return df_tmp - def encode_categorical_features(df_tmp): - #use count_encoding on site_id & device_id,also save the real values & what they've been encoded to in a dictionary - - #encode site_id - site_id_encoder = CountEncoder() - 
site_id_encoder.fit(df_tmp['site_id']) - df_tmp['site_id'] = site_id_encoder.transform(df_tmp['site_id']) - # site_id_encoder_dict = site_id_encoder.mapping[0]['mapping'] - - #encode device_id - device_id_encoder = CountEncoder() - device_id_encoder.fit(df_tmp['device_id']) - df_tmp['device_id'] = device_id_encoder.transform(df_tmp['device_id']) - # device_id_encoder_dict = device_id_encoder.mapping[0]['mapping'] - - device_category_encoder = OneHotEncoder(cols=['device_category']) - df_tmp = device_category_encoder.fit_transform(df_tmp) + def count_encode_categorical_features(df): + device_id_mappings, site_id_mappings, device_category_mappings = {}, {}, {} + for col in ["device_id", "site_id", "device_category"]: + counts = df[col].value_counts() + count_dict = dict(zip(counts.index, counts.values)) + if col == "device_id": + device_id_mappings = count_dict + elif col == "site_id": + site_id_mappings = count_dict + elif col == "device_category": + device_category_mappings = count_dict + df[f"{col}"] = df[col].map(count_dict) + mappings = [device_id_mappings, site_id_mappings, device_category_mappings] + for mapping in mappings: + upload_mapping_to_gcs(mapping, project_id, bucket, f"{mapping}.json") - return df_tmp + return df - def get_time_and_cyclic_features(df): + def get_time_and_cyclic_features(df, freq): attributes = ["year", "month", "day", "dayofweek", "hour"] max_vals = [2023, 12, 31, 6, 23] + if freq == "hourly": + attributes.append("minute") + max_vals.append(59) for a, m in zip(attributes, max_vals): df[a] = df["timestamp"].dt.__getattribute__(a) df[a + "_sin"] = np.sin(2 * np.pi * df[a] / m) df[a + "_cos"] = np.cos(2 * np.pi * df[a] / m) - + df["week"] = df["timestamp"].dt.isocalendar().week df["week_sin"] = np.sin(2 * np.pi * df["week"] / 52) df["week_cos"] = np.cos(2 * np.pi * df["week"] / 52) df.drop(columns=attributes, inplace=True) return df - data["timestamp"] = pd.to_datetime(data["timestamp"]) - df_tmp = get_other_features(data, frequency) - df_tmp = get_lag_features(df_tmp, target_column, frequency) - df_tmp = encode_categorical_features(df_tmp) - df_tmp = get_time_and_cyclic_features(df_tmp) + df_tmp = get_lag_features(data, target_column, frequency) + df_tmp = count_encode_categorical_features(df_tmp) + df_tmp = get_time_and_cyclic_features(df_tmp, frequency) return df_tmp @staticmethod - def train_and_save_hourly_forecast_model(train): # separate code for hourly model + def train_and_save_forecast_models(train, frequency): """ Perform the actual training for hourly data """ - train["created_at"] = pd.to_datetime(train["created_at"]) - train = train.sort_values(by=["device_number", "created_at"]) - features = [c for c in train.columns if c not in ["created_at", "pm2_5"]] + train["timestamp"] = pd.to_datetime(train["timestamp"]) + train = train.sort_values(by=["device_id", "timestamp"]) + features = [c for c in train.columns if c not in ["timestamp", "pm2_5"]] print(features) target_col = "pm2_5" - train_data, test_data = pd.DataFrame(), pd.DataFrame() - for device_number in train["device_number"].unique(): - device_df = train[train["device_number"] == device_number] - device_df = device_df.sort_values(by="created_at") - months = device_df["created_at"].dt.month.unique() - train_months = months[:4] - test_months = months[4:] - train_df = device_df[device_df["created_at"].dt.month.isin(train_months)] - test_df = device_df[device_df["created_at"].dt.month.isin(test_months)] + train_data, validation_data, test_data = ( + pd.DataFrame(), + pd.DataFrame(), + 
pd.DataFrame(), + ) + for device in train["device_id"].unique(): + device_df = train[train["device_id"] == device] + device_df = device_df.sort_values(by="timestamp") + months = device_df["timestamp"].dt.month.unique() + train_months = val_months = test_months = 0 + if frequency == "hourly": + train_months = months[:8] + val_months = months[8:9] + test_months = months[9:] + elif frequency == "daily": + train_months = months[:10] + val_months = months[10:11] + test_months = months[11:] + + train_df = device_df[device_df["timestamp"].dt.month.isin(train_months)] + val_df = device_df[device_df["timestamp"].dt.month.isin(val_months)] + test_df = device_df[device_df["timestamp"].dt.month.isin(test_months)] train_data = pd.concat([train_data, train_df]) + validation_data = pd.concat([validation_data, val_df]) test_data = pd.concat([test_data, test_df]) - train_data.drop(columns=["created_at"], axis=1, inplace=True) - test_data.drop(columns=["created_at"], axis=1, inplace=True) + train_data.drop(columns=["timestamp"], axis=1, inplace=True) + validation_data.drop(columns=["timestamp"], axis=1, inplace=True) + test_data.drop(columns=["timestamp"], axis=1, inplace=True) - train_target, test_target = train_data[target_col], test_data[target_col] + train_target, validation_target, test_target = ( + train_data[target_col], + validation_data[target_col], + test_data[target_col], + ) - with mlflow.start_run(): - print("Model training started.....") - n_estimators = 5000 - learning_rate = 0.05 - colsample_bytree = 0.4 - reg_alpha = 0 - reg_lambda = 1 - max_depth = 1 - random_state = 1 + mlflow.set_tracking_uri(configuration.MLFLOW_TRACKING_URI) + mlflow.set_experiment(f"LGBM_{frequency}_forecast_model_{environment}") + registered_model_name = f"LGBM_{frequency}_forecast_model_{environment}" - clf = LGBMRegressor( - n_estimators=n_estimators, - learning_rate=learning_rate, - colsample_bytree=colsample_bytree, - reg_alpha=reg_alpha, - reg_lambda=reg_lambda, - max_depth=max_depth, - random_state=random_state, - ) + mlflow.lightgbm.autolog(registered_model_name=registered_model_name) - clf.fit( - train_data[features], - train_target, - eval_set=[(test_data[features], test_target)], - callbacks=[early_stopping(stopping_rounds=150)], - eval_metric="rmse", - ) - print("Model training completed.....") - - # Log parameters - mlflow.log_param("n_estimators", n_estimators) - mlflow.log_param("learning_rate", learning_rate) - mlflow.log_param("colsample_bytree", colsample_bytree) - mlflow.log_param("reg_alpha", reg_alpha) - mlflow.log_param("reg_lamba", reg_lambda) - mlflow.log_param("max_depth", max_depth) - mlflow.log_param("random_state", random_state) - - # Log moder - mlflow.sklearn.log_model( - sk_model=clf, - artifact_path="hourly_forecast_model", - registered_model_name=f"LGBM_hourly_forecast_model_development", - ) - - print("Being model validation.....") - - val_preds = clf.predict(test_data[features]) - rmse_val = mean_squared_error(test_data[target_col], val_preds) ** 0.5 + sampler = optuna.samplers.TPESampler() + pruner = optuna.pruners.SuccessiveHalvingPruner( + min_resource=10, reduction_factor=2, min_early_stopping_rate=0 + ) + study = optuna.create_study( + direction="minimize", study_name="LGBM", sampler=sampler, pruner=pruner + ) - print("Model validation completed.....") - print(f"Validation RMSE is {rmse_val}") + def objective(trial): + param_grid = { + "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.1, 1), + "reg_alpha": trial.suggest_uniform("reg_alpha", 0, 10), + "reg_lambda": 
trial.suggest_uniform("reg_lambda", 0, 10), + "n_estimators": trial.suggest_categorical("n_estimators", [10000]), + "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3), + "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20), + "max_depth": trial.suggest_int("max_depth", 3, 12), + "min_split_gain": trial.suggest_float("min_gain_to_split", 0, 15), + } - # Log metrics - mlflow.log_metric("VAL_RMSE", rmse_val) + score = 0 + for step in range(4): + lgb_reg = LGBMRegressor( + objective="regression", + n_jobs=2, + random_state=42, + **param_grid, + verbosity=2, + ) + lgb_reg.fit( + train_data[features], + train_target, + categorical_feature=["device_id", "site_id", "device_category"], + eval_set=[(test_data[features], test_target)], + eval_metric="rmse", + ) - best_iter = clf.best_iteration_ - clf = LGBMRegressor( - n_estimators=best_iter, - learning_rate=0.05, - colsample_bytree=0.4, - reg_alpha=2, - reg_lambda=1, - max_depth=-1, - random_state=1, - verbosity=2, - ) - train["device_number"] = train["device_number"].astype(int) - clf.fit(train[features], train[target_col]) - upload_trained_model_to_gcs(clf, project_id, bucket, "hourly_forecast_model") + val_preds = lgb_reg.predict(validation_data[features]) + score = mean_squared_error(validation_target, val_preds) + if trial.should_prune(): + raise optuna.TrialPruned() - @staticmethod - def train_and_save_daily_forecast_model(train): # separate code for monthly model - train["timestamp"] = pd.to_datetime(train["timestamp"]) - train = train.sort_values(by=['device_id', 'timestamp']) - features = [c for c in train.columns if c not in ["timestamp", "pm2_5"]] - print(features) - target_col = "pm2_5" - train_data, test_data = pd.DataFrame(), pd.DataFrame() - - for device_number in train["device_number"].unique(): - device_df = train[train["device_number"] == device_number] - device_df = device_df.sort_values(by="created_at") - months = device_df["created_at"].dt.month.unique() - train_months = months[:8] - test_months = months[8:] - train_df = device_df[device_df["created_at"].dt.month.isin(train_months)] - test_df = device_df[device_df["created_at"].dt.month.isin(test_months)] - train_data = pd.concat([train_data, train_df]) - test_data = pd.concat([test_data, test_df]) + return score - train_data.drop(columns=["created_at"], axis=1, inplace=True) - test_data.drop(columns=["created_at"], axis=1, inplace=True) + study.optimize(objective, n_trials=150) - train_target, test_target = train_data[target_col], test_data[target_col] with mlflow.start_run(): - print("Model training started.....") - n_estimators = 5000 - learning_rate = 0.05 - colsample_bytree = 0.4 - reg_alpha = 0 - reg_lambda = 1 - max_depth = 1 - random_state = 1 - + best_params = study.best_params + print(f"Best params are {best_params}") clf = LGBMRegressor( - n_estimators=n_estimators, - learning_rate=learning_rate, - colsample_bytree=colsample_bytree, - reg_alpha=reg_alpha, - reg_lambda=reg_lambda, - max_depth=max_depth, - random_state=random_state, + n_estimators=best_params["n_estimators"], + learning_rate=best_params["learning_rate"], + colsample_bytree=best_params["colsample_bytree"], + reg_alpha=best_params["reg_alpha"], + reg_lambda=best_params["reg_lambda"], + max_depth=best_params["max_depth"], + random_state=42, + verbosity=2, ) clf.fit( train_data[features], train_target, eval_set=[(test_data[features], test_target)], - callbacks=[early_stopping(stopping_rounds=150)], eval_metric="rmse", - ) - print("Model training completed.....") - - # Log 
parameters - mlflow.log_param("n_estimators", n_estimators) - mlflow.log_param("learning_rate", learning_rate) - mlflow.log_param("colsample_bytree", colsample_bytree) - mlflow.log_param("reg_alpha", reg_alpha) - mlflow.log_param("reg_lamba", reg_lambda) - mlflow.log_param("max_depth", max_depth) - mlflow.log_param("random_state", random_state) - - # Log model - mlflow.sklearn.log_model( - sk_model=clf, - artifact_path="daily_forecast_model", - registered_model_name=f"LGBM_daily_forecast_model_development", + categorical_feature=["device_id", "site_id", "device_category"], ) - # model validation - print("Being model validation.....") + # train quantile regression models for 0.025 and 0.975 quantiles + clf_025 = LGBMRegressor( + n_estimators=best_params["n_estimators"], + learning_rate=best_params["learning_rate"], + colsample_bytree=best_params["colsample_bytree"], + reg_alpha=best_params["reg_alpha"], + reg_lambda=best_params["reg_lambda"], + max_depth=best_params["max_depth"], + random_state=42, + verbosity=2, + objective="quantile", + alpha=0.025, + metric="quantile", + ) - val_preds = clf.predict(test_data[features]) - rmse_val = mean_squared_error(test_data[target_col], val_preds) ** 0.5 + clf_025.fit( + train_data[features], + train_target, + eval_set=[(test_data[features], test_target)], + categorical_feature=["device_id", "site_id", "device_category"], + ) - print("Model validation completed.....") - print(f"Validation RMSE is {rmse_val}") + clf_975 = LGBMRegressor( + n_estimators=best_params["n_estimators"], + learning_rate=best_params["learning_rate"], + colsample_bytree=best_params["colsample_bytree"], + reg_alpha=best_params["reg_alpha"], + reg_lambda=best_params["reg_lambda"], + max_depth=best_params["max_depth"], + random_state=42, + verbosity=2, + objective="quantile", + alpha=0.975, + metric="quantile", + ) - # Log metrics - mlflow.log_metric("VAL_RMSE", rmse_val) + clf_975.fit( + train_data[features], + train_target, + eval_set=[(test_data[features], test_target)], + categorical_feature=["device_id", "site_id", "device_category"], + ) - best_iter = clf.best_iteration_ - clf = LGBMRegressor( - n_estimators=best_iter, - learning_rate=0.05, - colsample_bytree=0.4, - reg_alpha=2, - reg_lambda=1, - max_depth=-1, - random_state=1, + upload_trained_model_to_gcs( + clf, project_id, bucket, "hourly_forecast_model.pkl" ) - clf.fit(train[features], train[target_col]) - upload_trained_model_to_gcs(clf, project_id, bucket, "daily_forecast_model.pkl") - print("Model saved successfully") #### FORECAST JOB UTILS #### @staticmethod def preprocess_historical_data(data, frequency): - data["created_at"] = pd.to_datetime(data["created_at"]) + data["timestamp"] = pd.to_datetime(data["timestamp"]) data["device_number"] = data["device_number"].astype(str) data["pm2_5"] = data.groupby(fixed_columns + ["device_number"])[ "pm2_5" ].transform(lambda x: x.interpolate(method="linear", limit_direction="both")) if frequency == "hourly": data.sort_values( - by=fixed_columns + ["device_number", "created_at"], inplace=True + by=fixed_columns + ["device_number", "timestamp"], inplace=True ) elif frequency == "daily": data = ( data.groupby(fixed_columns + ["device_number"]) - .resample("D", on="created_at") + .resample("D", on="timestamp") .mean(numeric_only=True) ) data.reset_index(inplace=True) @@ -388,7 +360,7 @@ def preprocess_historical_data(data, frequency): lambda x: x.interpolate(method="linear", limit_direction="both") ) data.sort_values( - by=fixed_columns + ["device_number", "created_at"], 
inplace=True + by=fixed_columns + ["device_number", "timestamp"], inplace=True ) else: raise ValueError("Invalid frequency argument") @@ -398,8 +370,8 @@ def preprocess_historical_data(data, frequency): @staticmethod def get_lag_features(df_tmp, TARGET_COL, frequency): - df_tmp["created_at"] = pd.to_datetime(df_tmp["created_at"]) - df_tmp = df_tmp.sort_values(by=fixed_columns + ["device_number", "created_at"]) + df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) + df_tmp = df_tmp.sort_values(by=fixed_columns + ["device_number", "timestamp"]) if frequency == "hourly": shifts = [1, 2] for s in shifts: @@ -440,27 +412,27 @@ def get_lag_features(df_tmp, TARGET_COL, frequency): @staticmethod def get_time_features(df_tmp, frequency): - df_tmp["created_at"] = pd.to_datetime(df_tmp["created_at"]) + df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) attributes = ["year", "month", "day", "dayofweek"] if frequency == "hourly": attributes.extend(["hour", "minute"]) for a in attributes: - df_tmp[a] = df_tmp["created_at"].dt.__getattribute__(a) + df_tmp[a] = df_tmp["timestamp"].dt.__getattribute__(a) - df_tmp["week"] = df_tmp["created_at"].dt.isocalendar().week + df_tmp["week"] = df_tmp["timestamp"].dt.isocalendar().week print("Adding other features") return df_tmp @staticmethod def generate_hourly_forecasts(data, project_name, bucket_name, source_blob_name): - data["created_at"] = pd.to_datetime(data["created_at"]) + data["timestamp"] = pd.to_datetime(data["timestamp"]) def get_new_row(df, device1, model): last_row = df[df["device_number"] == device1].iloc[-1] new_row = pd.Series(index=last_row.index, dtype="float64") for i in fixed_columns: new_row[i] = last_row[i] - new_row["created_at"] = last_row["created_at"] + pd.Timedelta(hours=1) + new_row["timestamp"] = last_row["timestamp"] + pd.Timedelta(hours=1) new_row["device_number"] = device1 new_row[f"pm2_5_last_1_hour"] = last_row["pm2_5"] new_row[f"pm2_5_last_2_hour"] = last_row[f"pm2_5_last_{1}_hour"] @@ -498,11 +470,11 @@ def get_new_row(df, device1, model): attributes = ["year", "month", "day", "dayofweek", "hour", "minute"] for a in attributes: - new_row[a] = new_row["created_at"].__getattribute__(a) - new_row["week"] = new_row["created_at"].isocalendar().week + new_row[a] = new_row["timestamp"].__getattribute__(a) + new_row["week"] = new_row["timestamp"].isocalendar().week new_row["pm2_5"] = model.predict( - new_row.drop(fixed_columns + ["created_at", "pm2_5"]).values.reshape( + new_row.drop(fixed_columns + ["timestamp", "pm2_5"]).values.reshape( 1, -1 ) )[0] @@ -524,7 +496,7 @@ def get_new_row(df, device1, model): forecasts["device_number"] = forecasts["device_number"].astype(int) forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) - forecasts.rename(columns={"created_at": "time"}, inplace=True) + forecasts.rename(columns={"timestamp": "time"}, inplace=True) forecasts["time"] = pd.to_datetime(forecasts["time"], utc=True) current_time = datetime.utcnow() current_time_utc = pd.Timestamp(current_time, tz="UTC") @@ -536,14 +508,14 @@ def get_new_row(df, device1, model): @staticmethod def generate_daily_forecasts(data, project_name, bucket_name, source_blob_name): - data["created_at"] = pd.to_datetime(data["created_at"]) + data["timestamp"] = pd.to_datetime(data["timestamp"]) def get_new_row(df_tmp, device, model): last_row = df_tmp[df_tmp["device_number"] == device].iloc[-1] new_row = pd.Series(index=last_row.index, dtype="float64") for i in fixed_columns: new_row[i] = last_row[i] - new_row["created_at"] = last_row["created_at"] + 
pd.Timedelta(days=1) + new_row["timestamp"] = last_row["timestamp"] + pd.Timedelta(days=1) new_row["device_number"] = device new_row[f"pm2_5_last_1_day"] = last_row["pm2_5"] new_row[f"pm2_5_last_2_day"] = last_row[f"pm2_5_last_{1}_day"] @@ -577,11 +549,11 @@ def get_new_row(df_tmp, device, model): # Use the date of the new row to create other features attributes = ["year", "month", "day", "dayofweek"] for a in attributes: - new_row[a] = new_row["created_at"].__getattribute__(a) - new_row["week"] = new_row["created_at"].isocalendar().week + new_row[a] = new_row["timestamp"].__getattribute__(a) + new_row["week"] = new_row["timestamp"].isocalendar().week new_row["pm2_5"] = model.predict( - new_row.drop(fixed_columns + ["created_at", "pm2_5"]).values.reshape( + new_row.drop(fixed_columns + ["timestamp", "pm2_5"]).values.reshape( 1, -1 ) )[0] @@ -608,7 +580,7 @@ def get_new_row(df_tmp, device, model): forecasts = pd.concat([forecasts, test_copy], ignore_index=True) forecasts["device_number"] = forecasts["device_number"].astype(int) forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) - forecasts.rename(columns={"created_at": "time"}, inplace=True) + forecasts.rename(columns={"timestamp": "time"}, inplace=True) current_time = datetime.utcnow() current_time_utc = pd.Timestamp(current_time, tz="UTC") result = forecasts[fixed_columns + ["time", "pm2_5", "device_number"]][ @@ -619,7 +591,7 @@ def get_new_row(df_tmp, device, model): @staticmethod def save_forecasts_to_mongo(data, frequency): - created_at = pd.to_datetime(datetime.now()).isoformat() + timestamp = pd.to_datetime(datetime.now()).isoformat() device_numbers = data["device_number"].unique() forecast_results = [ { @@ -628,7 +600,7 @@ def save_forecasts_to_mongo(data, frequency): else data[data["device_number"] == i][field].tolist() for field in data.columns } - | {"created_at": created_at} + | {"timestamp": timestamp} for i in device_numbers ] client = pm.MongoClient(configuration.MONGO_URI) diff --git a/src/airflow/airqo_etl_utils/plume_labs_api.py b/src/airflow/airqo_etl_utils/plume_labs_api.py index b3ba62e0ac..def07e7114 100644 --- a/src/airflow/airqo_etl_utils/plume_labs_api.py +++ b/src/airflow/airqo_etl_utils/plume_labs_api.py @@ -182,24 +182,25 @@ def __request(self, endpoint, params): total=5, backoff_factor=5, ) - + http = urllib3.PoolManager(retries=retry_strategy) - + try: response = http.request( - "GET", - url, - fields=params,) - + "GET", + url, + fields=params, + ) + response_data = response.data print(response._request_url) - + if response.status == 200: return json.loads(response_data) else: Utils.handle_api_error(response) return None - + except urllib3.exceptions.HTTPError as e: print(f"HTTPError: {e}") return None diff --git a/src/airflow/airqo_etl_utils/purple_air_api.py b/src/airflow/airqo_etl_utils/purple_air_api.py index b6dd0ec4d4..025f26283d 100644 --- a/src/airflow/airqo_etl_utils/purple_air_api.py +++ b/src/airflow/airqo_etl_utils/purple_air_api.py @@ -32,31 +32,31 @@ def get_data( return response if response else {} def __request(self, endpoint, params): - url = f"{self.PURPLE_AIR_BASE_URL}{endpoint}" retry_strategy = Retry( total=5, backoff_factor=5, ) - + http = urllib3.PoolManager(retries=retry_strategy) - + try: response = http.request( - "GET", - url, + "GET", + url, fields=params, - headers={"x-api-key": self.PURPLE_AIR_API_KEY},) - + headers={"x-api-key": self.PURPLE_AIR_API_KEY}, + ) + response_data = response.data print(response._request_url) - + if response.status == 200: return 
json.loads(response_data) else: Utils.handle_api_error(response) return None - + except urllib3.exceptions.HTTPError as e: print(f"HTTPError: {e}") return None diff --git a/src/airflow/airqo_etl_utils/tahmo_api.py b/src/airflow/airqo_etl_utils/tahmo_api.py index 9f9db8a7f8..07a3e771eb 100644 --- a/src/airflow/airqo_etl_utils/tahmo_api.py +++ b/src/airflow/airqo_etl_utils/tahmo_api.py @@ -54,32 +54,29 @@ def get_measurements(self, start_time, end_time, station_codes=None): return measurements.to_dict(orient="records") def __request(self, endpoint, params): - url = f"{self.BASE_URL}{endpoint}" retry_strategy = Retry( total=5, backoff_factor=5, ) - + http = urllib3.PoolManager(retries=retry_strategy) - + try: - headers = urllib3.util.make_headers(basic_auth=f"{self.API_KEY}:{self.API_SECRET}") - response = http.request( - "GET", - url, - fields=params, - headers=headers) - + headers = urllib3.util.make_headers( + basic_auth=f"{self.API_KEY}:{self.API_SECRET}" + ) + response = http.request("GET", url, fields=params, headers=headers) + response_data = response.data print("Tahmo API request: %s" % response._request_url) - + if response.status == 200: return json.loads(response_data) else: Utils.handle_api_error(response) return None - + except urllib3.exceptions.HTTPError as e: print(f"HTTPError: {e}") return None From 9ec80cc95deecb53c43bb003da0f12f431ec7c18 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Mon, 21 Aug 2023 09:37:20 +0300 Subject: [PATCH 03/43] Fix category encoder --- src/airflow/airqo_etl_utils/bigquery_api.py | 2 +- src/airflow/airqo_etl_utils/ml_utils.py | 64 ++++++++++----------- 2 files changed, 30 insertions(+), 36 deletions(-) diff --git a/src/airflow/airqo_etl_utils/bigquery_api.py b/src/airflow/airqo_etl_utils/bigquery_api.py index 82212655e2..360fb74df0 100644 --- a/src/airflow/airqo_etl_utils/bigquery_api.py +++ b/src/airflow/airqo_etl_utils/bigquery_api.py @@ -639,7 +639,7 @@ def fetch_training_data( t1.device_id, t1.timestamp, t1.site_id, - t1.pm2_5_calibrated_value, + t1.pm2_5_calibrated_value as pm2_5, t2.latitude, t2.longitude, t3.device_category diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index ef9af89e76..3d551aa72a 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -1,4 +1,5 @@ import json +import random from datetime import datetime import gcsfs @@ -65,17 +66,17 @@ class ForecastUtils: @staticmethod def preprocess_training_data(data, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) - data["pm2_5"] = data.groupby("device_id")["pm2_5"].transform( + data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])["pm2_5"].transform( lambda x: x.interpolate(method="linear", limit_direction="both") ) if frequency == "daily": data = ( - data.groupby(["device_id"]) + data.groupby(["device_id", "site_id", "device_category"]) .resample("D", on="timestamp") .mean(numeric_only=True) ) data.reset_index(inplace=True) - data["pm2_5"] = data.groupby("device_id")["pm2_5"].transform( + data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])["pm2_5"].transform( lambda x: x.interpolate(method="linear", limit_direction="both") ) data = data.dropna(subset=["pm2_5"]) @@ -84,8 +85,6 @@ def preprocess_training_data(data, frequency): @staticmethod def feature_eng_training_data(data, target_column, frequency): def get_lag_features(df, target_col, freq): - df = df.sort_values(by=["device_id", 
"timestamp"]) - if freq == "daily": shifts = [1, 2, 3, 7, 14] for s in shifts: @@ -125,30 +124,30 @@ def get_lag_features(df, target_col, freq): return df - def count_encode_categorical_features(df): - device_id_mappings, site_id_mappings, device_category_mappings = {}, {}, {} - for col in ["device_id", "site_id", "device_category"]: - counts = df[col].value_counts() - count_dict = dict(zip(counts.index, counts.values)) - if col == "device_id": - device_id_mappings = count_dict - elif col == "site_id": - site_id_mappings = count_dict - elif col == "device_category": - device_category_mappings = count_dict - df[f"{col}"] = df[col].map(count_dict) - mappings = [device_id_mappings, site_id_mappings, device_category_mappings] - for mapping in mappings: - upload_mapping_to_gcs(mapping, project_id, bucket, f"{mapping}.json") - + def encode_categorical_features(df): + columns = ["device_id", "site_id", "device_category"] + mappings = [] + for col in columns: + mapping = {} + for val in df[col].unique(): + num = random.randint(0, 10000) + while num in mapping.values(): + num = random.randint(0, 10000) + mapping[val] = num + df[col] = df[col].map(mapping) + mappings.append(mapping) + for i, col in enumerate(columns): + upload_mapping_to_gcs( + mappings[i], project_id, bucket, f"{col}_mapping.json" + ) return df def get_time_and_cyclic_features(df, freq): - attributes = ["year", "month", "day", "dayofweek", "hour"] + attributes = ["year", "month", "day", "dayofweek"] max_vals = [2023, 12, 31, 6, 23] if freq == "hourly": - attributes.append("minute") - max_vals.append(59) + attributes.extend(["hour", "minute"]) + max_vals.append([23, 59]) for a, m in zip(attributes, max_vals): df[a] = df["timestamp"].dt.__getattribute__(a) df[a + "_sin"] = np.sin(2 * np.pi * df[a] / m) @@ -162,7 +161,7 @@ def get_time_and_cyclic_features(df, freq): data["timestamp"] = pd.to_datetime(data["timestamp"]) df_tmp = get_lag_features(data, target_column, frequency) - df_tmp = count_encode_categorical_features(df_tmp) + df_tmp = encode_categorical_features(df_tmp) df_tmp = get_time_and_cyclic_features(df_tmp, frequency) return df_tmp @@ -173,24 +172,19 @@ def train_and_save_forecast_models(train, frequency): Perform the actual training for hourly data """ train["timestamp"] = pd.to_datetime(train["timestamp"]) - train = train.sort_values(by=["device_id", "timestamp"]) features = [c for c in train.columns if c not in ["timestamp", "pm2_5"]] print(features) target_col = "pm2_5" - train_data, validation_data, test_data = ( - pd.DataFrame(), - pd.DataFrame(), - pd.DataFrame(), - ) + train_data = validation_data = test_data = pd.DataFrame() for device in train["device_id"].unique(): device_df = train[train["device_id"] == device] device_df = device_df.sort_values(by="timestamp") months = device_df["timestamp"].dt.month.unique() - train_months = val_months = test_months = 0 + train_months = val_months = test_months = [] if frequency == "hourly": - train_months = months[:8] - val_months = months[8:9] - test_months = months[9:] + train_months = months[:10] + val_months = months[10:11] + test_months = months[11:] elif frequency == "daily": train_months = months[:10] val_months = months[10:11] From 1f766234753fcdbf9784ee4a0a7ae6787f0ff5d0 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Mon, 21 Aug 2023 23:03:16 +0300 Subject: [PATCH 04/43] code cleanup - training job --- src/airflow/airqo_etl_utils/ml_utils.py | 123 ++++++++++++------------ 1 file changed, 61 insertions(+), 62 
deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 3d551aa72a..b1131e53ad 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -9,7 +9,7 @@ import optuna import pandas as pd import pymongo as pm -from lightgbm import LGBMRegressor +from lightgbm import LGBMRegressor, early_stopping from scipy.stats import skew from sklearn.metrics import mean_squared_error @@ -178,17 +178,16 @@ def train_and_save_forecast_models(train, frequency): train_data = validation_data = test_data = pd.DataFrame() for device in train["device_id"].unique(): device_df = train[train["device_id"] == device] - device_df = device_df.sort_values(by="timestamp") months = device_df["timestamp"].dt.month.unique() train_months = val_months = test_months = [] if frequency == "hourly": - train_months = months[:10] - val_months = months[10:11] - test_months = months[11:] + train_months = months[:8] + val_months = months[9] + test_months = months[10] elif frequency == "daily": - train_months = months[:10] - val_months = months[10:11] - test_months = months[11:] + train_months = months[:8] + val_months = months[8:9] + test_months = months[9:] train_df = device_df[device_df["timestamp"].dt.month.isin(train_months)] val_df = device_df[device_df["timestamp"].dt.month.isin(val_months)] @@ -207,11 +206,6 @@ def train_and_save_forecast_models(train, frequency): test_data[target_col], ) - mlflow.set_tracking_uri(configuration.MLFLOW_TRACKING_URI) - mlflow.set_experiment(f"LGBM_{frequency}_forecast_model_{environment}") - registered_model_name = f"LGBM_{frequency}_forecast_model_{environment}" - - mlflow.lightgbm.autolog(registered_model_name=registered_model_name) sampler = optuna.samplers.TPESampler() pruner = optuna.pruners.SuccessiveHalvingPruner( @@ -223,21 +217,18 @@ def train_and_save_forecast_models(train, frequency): def objective(trial): param_grid = { - "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.1, 1), - "reg_alpha": trial.suggest_uniform("reg_alpha", 0, 10), - "reg_lambda": trial.suggest_uniform("reg_lambda", 0, 10), - "n_estimators": trial.suggest_categorical("n_estimators", [10000]), + "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1), + "reg_alpha": trial.suggest_float("reg_alpha", 0, 10), + "reg_lambda": trial.suggest_float("reg_lambda", 0, 10), + "n_estimators": trial.suggest_categorical("n_estimators", [50]), "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3), - "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20), - "max_depth": trial.suggest_int("max_depth", 3, 12), - "min_split_gain": trial.suggest_float("min_gain_to_split", 0, 15), + "num_leaves": trial.suggest_int("num_leaves", 20, 50), + "max_depth": trial.suggest_int("max_depth", 4, 7), } - score = 0 for step in range(4): lgb_reg = LGBMRegressor( objective="regression", - n_jobs=2, random_state=42, **param_grid, verbosity=2, @@ -248,6 +239,7 @@ def objective(trial): categorical_feature=["device_id", "site_id", "device_category"], eval_set=[(test_data[features], test_target)], eval_metric="rmse", + callbacks=[early_stopping(stopping_rounds=150)], ) val_preds = lgb_reg.predict(validation_data[features]) @@ -257,8 +249,14 @@ def objective(trial): return score - study.optimize(objective, n_trials=150) + study.optimize(objective, n_trials=15) + + + mlflow.set_tracking_uri(configuration.MLFLOW_TRACKING_URI) + mlflow.set_experiment(f"{frequency}_forecast_model_{environment}") + 
registered_model_name = f"{frequency}_forecast_model_{environment}" + mlflow.lightgbm.autolog(registered_model_name=registered_model_name, log_datasets=False) with mlflow.start_run(): best_params = study.best_params print(f"Best params are {best_params}") @@ -279,52 +277,53 @@ def objective(trial): eval_set=[(test_data[features], test_target)], eval_metric="rmse", categorical_feature=["device_id", "site_id", "device_category"], + callbacks=[early_stopping(stopping_rounds=150)], ) # train quantile regression models for 0.025 and 0.975 quantiles - clf_025 = LGBMRegressor( - n_estimators=best_params["n_estimators"], - learning_rate=best_params["learning_rate"], - colsample_bytree=best_params["colsample_bytree"], - reg_alpha=best_params["reg_alpha"], - reg_lambda=best_params["reg_lambda"], - max_depth=best_params["max_depth"], - random_state=42, - verbosity=2, - objective="quantile", - alpha=0.025, - metric="quantile", - ) + clf_025 = LGBMRegressor( + n_estimators=best_params["n_estimators"], + learning_rate=best_params["learning_rate"], + colsample_bytree=best_params["colsample_bytree"], + reg_alpha=best_params["reg_alpha"], + reg_lambda=best_params["reg_lambda"], + max_depth=best_params["max_depth"], + random_state=42, + verbosity=2, + objective="quantile", + alpha=0.025, + metric="quantile", + ) - clf_025.fit( - train_data[features], - train_target, - eval_set=[(test_data[features], test_target)], - categorical_feature=["device_id", "site_id", "device_category"], - ) + clf_025.fit( + train_data[features], + train_target, + eval_set=[(test_data[features], test_target)], + categorical_feature=["device_id", "site_id", "device_category"], + ) - clf_975 = LGBMRegressor( - n_estimators=best_params["n_estimators"], - learning_rate=best_params["learning_rate"], - colsample_bytree=best_params["colsample_bytree"], - reg_alpha=best_params["reg_alpha"], - reg_lambda=best_params["reg_lambda"], - max_depth=best_params["max_depth"], - random_state=42, - verbosity=2, - objective="quantile", - alpha=0.975, - metric="quantile", - ) + clf_975 = LGBMRegressor( + n_estimators=best_params["n_estimators"], + learning_rate=best_params["learning_rate"], + colsample_bytree=best_params["colsample_bytree"], + reg_alpha=best_params["reg_alpha"], + reg_lambda=best_params["reg_lambda"], + max_depth=best_params["max_depth"], + random_state=42, + verbosity=2, + objective="quantile", + alpha=0.975, + metric="quantile", + ) - clf_975.fit( - train_data[features], - train_target, - eval_set=[(test_data[features], test_target)], - categorical_feature=["device_id", "site_id", "device_category"], - ) + clf_975.fit( + train_data[features], + train_target, + eval_set=[(test_data[features], test_target)], + categorical_feature=["device_id", "site_id", "device_category"], + ) - upload_trained_model_to_gcs( + upload_trained_model_to_gcs( clf, project_id, bucket, "hourly_forecast_model.pkl" ) From 365746cde7b0e1e05dfe603953c6b42c734f3c84 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Mon, 21 Aug 2023 23:05:57 +0300 Subject: [PATCH 05/43] setup actual forecasting job --- src/airflow/airqo_etl_utils/ml_utils.py | 89 +------------------------ src/airflow/dags/ml_training_jobs.py | 4 +- 2 files changed, 3 insertions(+), 90 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index b1131e53ad..a1e07f9a79 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -62,9 +62,8 @@ def 
get_mapping_from_gcs(project_name, bucket_name, source_blob_name): class ForecastUtils: - # FORECAST MODEL TRAINING UTILS @staticmethod - def preprocess_training_data(data, frequency): + def preprocess__data(data, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])["pm2_5"].transform( lambda x: x.interpolate(method="linear", limit_direction="both") @@ -329,92 +328,6 @@ def objective(trial): #### FORECAST JOB UTILS #### - @staticmethod - def preprocess_historical_data(data, frequency): - data["timestamp"] = pd.to_datetime(data["timestamp"]) - data["device_number"] = data["device_number"].astype(str) - data["pm2_5"] = data.groupby(fixed_columns + ["device_number"])[ - "pm2_5" - ].transform(lambda x: x.interpolate(method="linear", limit_direction="both")) - if frequency == "hourly": - data.sort_values( - by=fixed_columns + ["device_number", "timestamp"], inplace=True - ) - elif frequency == "daily": - data = ( - data.groupby(fixed_columns + ["device_number"]) - .resample("D", on="timestamp") - .mean(numeric_only=True) - ) - data.reset_index(inplace=True) - data["pm2_5"] = data.groupby(fixed_columns + ["device_number"])[ - "pm2_5" - ].transform( - lambda x: x.interpolate(method="linear", limit_direction="both") - ) - data.sort_values( - by=fixed_columns + ["device_number", "timestamp"], inplace=True - ) - else: - raise ValueError("Invalid frequency argument") - data["device_number"] = data["device_number"].astype(int) - data = data.dropna(subset=["pm2_5"]) - return data - - @staticmethod - def get_lag_features(df_tmp, TARGET_COL, frequency): - df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) - df_tmp = df_tmp.sort_values(by=fixed_columns + ["device_number", "timestamp"]) - if frequency == "hourly": - shifts = [1, 2] - for s in shifts: - df_tmp[f"pm2_5_last_{s}_hour"] = df_tmp.groupby(["device_number"])[ - TARGET_COL - ].shift(s) - - shifts = [6, 12, 24, 48] - functions = ["mean", "std", "median", "skew"] - for s in shifts: - for f in functions: - df_tmp[f"pm2_5_{f}_{s}_hour"] = ( - df_tmp.groupby(["device_number"])[TARGET_COL] - .shift(1) - .rolling(s) - .agg(f) - ) - elif frequency == "daily": - shifts = [1, 2] - for s in shifts: - df_tmp[f"pm2_5_last_{s}_day"] = df_tmp.groupby(["device_number"])[ - TARGET_COL - ].shift(s) - shifts = [3, 7, 14, 30] - functions = ["mean", "std", "max", "min"] - for s in shifts: - for f in functions: - df_tmp[f"pm2_5_{f}_{s}_day"] = ( - df_tmp.groupby(["device_number"])[TARGET_COL] - .shift(1) - .rolling(s) - .agg(f) - ) - else: - raise ValueError("Invalid frequency argument") - print("Adding lag features") - return df_tmp - - @staticmethod - def get_time_features(df_tmp, frequency): - df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) - attributes = ["year", "month", "day", "dayofweek"] - if frequency == "hourly": - attributes.extend(["hour", "minute"]) - for a in attributes: - df_tmp[a] = df_tmp["timestamp"].dt.__getattribute__(a) - - df_tmp["week"] = df_tmp["timestamp"].dt.isocalendar().week - print("Adding other features") - return df_tmp @staticmethod def generate_hourly_forecasts(data, project_name, bucket_name, source_blob_name): diff --git a/src/airflow/dags/ml_training_jobs.py b/src/airflow/dags/ml_training_jobs.py index 180f7f7ef2..69eaaa900f 100644 --- a/src/airflow/dags/ml_training_jobs.py +++ b/src/airflow/dags/ml_training_jobs.py @@ -29,7 +29,7 @@ def fetch_training_data_for_hourly_forecast_model(): @task() def 
preprocess_training_data_for_hourly_forecast_model(data): - return ForecastUtils.preprocess_training_data(data, "hourly") + return ForecastUtils.preprocess__data(data, "hourly") @task() def feature_engineer_training_data_for_hourly_forecast_model(data): @@ -53,7 +53,7 @@ def fetch_training_data_for_daily_forecast_model(): @task() def preprocess_training_data_for_daily_forecast_model(data): - return ForecastUtils.preprocess_training_data(data, "daily") + return ForecastUtils.preprocess__data(data, "daily") @task() def feature_engineer_data_for_daily_forecast_model(data): From 6372a12151b26258af6011910a28c39e19bde538 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Tue, 22 Aug 2023 09:41:50 +0300 Subject: [PATCH 06/43] update forecasting job to match training job --- src/airflow/airqo_etl_utils/ml_utils.py | 245 ++++++++---------------- src/airflow/dags/ml_training_jobs.py | 4 +- 2 files changed, 83 insertions(+), 166 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index a1e07f9a79..d900eac7e2 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -10,7 +10,6 @@ import pandas as pd import pymongo as pm from lightgbm import LGBMRegressor, early_stopping -from scipy.stats import skew from sklearn.metrics import mean_squared_error from .config import configuration @@ -63,7 +62,7 @@ def get_mapping_from_gcs(project_name, bucket_name, source_blob_name): class ForecastUtils: @staticmethod - def preprocess__data(data, frequency): + def preprocess_data(data, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])["pm2_5"].transform( lambda x: x.interpolate(method="linear", limit_direction="both") @@ -82,7 +81,7 @@ def preprocess__data(data, frequency): return data @staticmethod - def feature_eng_training_data(data, target_column, frequency): + def feature_eng_data(data, target_column, frequency, job_type): def get_lag_features(df, target_col, freq): if freq == "daily": shifts = [1, 2, 3, 7, 14] @@ -155,13 +154,24 @@ def get_time_and_cyclic_features(df, freq): df["week"] = df["timestamp"].dt.isocalendar().week df["week_sin"] = np.sin(2 * np.pi * df["week"] / 52) df["week_cos"] = np.cos(2 * np.pi * df["week"] / 52) - df.drop(columns=attributes, inplace=True) + df.drop(columns=attributes + ["week"], inplace=True) return df + def decode_categorical_features(df): + columns = ["device_id", "site_id", "device_category"] + for col in columns: + mapping = get_mapping_from_gcs( + project_id, bucket, f"{col}_mapping.json" + ) + df[col] = df[col].map(mapping) + return df data["timestamp"] = pd.to_datetime(data["timestamp"]) df_tmp = get_lag_features(data, target_column, frequency) - df_tmp = encode_categorical_features(df_tmp) df_tmp = get_time_and_cyclic_features(df_tmp, frequency) + if job_type == "train": + df_tmp = encode_categorical_features(df_tmp) + elif job_type == "predict": + df_tmp = decode_categorical_features(df_tmp) return df_tmp @@ -326,173 +336,80 @@ def objective(trial): clf, project_id, bucket, "hourly_forecast_model.pkl" ) - #### FORECAST JOB UTILS #### - - - @staticmethod - def generate_hourly_forecasts(data, project_name, bucket_name, source_blob_name): - data["timestamp"] = pd.to_datetime(data["timestamp"]) - - def get_new_row(df, device1, model): - last_row = df[df["device_number"] == device1].iloc[-1] - new_row = pd.Series(index=last_row.index, 
dtype="float64") - for i in fixed_columns: - new_row[i] = last_row[i] - new_row["timestamp"] = last_row["timestamp"] + pd.Timedelta(hours=1) - new_row["device_number"] = device1 - new_row[f"pm2_5_last_1_hour"] = last_row["pm2_5"] - new_row[f"pm2_5_last_2_hour"] = last_row[f"pm2_5_last_{1}_hour"] - - shifts = [6, 12, 24, 48] - functions = ["mean", "std", "median", "skew"] - for s in shifts: - for f in functions: - if f == "mean": - new_row[f"pm2_5_{f}_{s}_hour"] = ( - last_row["pm2_5"] - + last_row[f"pm2_5_{f}_{s}_hour"] * (s - 1) - ) / s - elif f == "std": - new_row[f"pm2_5_{f}_{s}_hour"] = ( - np.sqrt( - (last_row["pm2_5"] - last_row[f"pm2_5_mean_{s}_hour"]) - ** 2 - + (last_row[f"pm2_5_{f}_{s}_hour"] ** 2 * (s - 1)) - ) - / s - ) - elif f == "median": - new_row[f"pm2_5_{f}_{s}_hour"] = np.median( - np.append( - last_row["pm2_5"], last_row[f"pm2_5_{f}_{s}_hour"] - ) - ) - elif f == "skew": - new_row[f"pm2_5_{f}_{s}_hour"] = skew( - np.append( - last_row["pm2_5"], last_row[f"pm2_5_{f}_{s}_hour"] - ) - ) - - attributes = ["year", "month", "day", "dayofweek", "hour", "minute"] - for a in attributes: - new_row[a] = new_row["timestamp"].__getattribute__(a) - new_row["week"] = new_row["timestamp"].isocalendar().week - - new_row["pm2_5"] = model.predict( - new_row.drop(fixed_columns + ["timestamp", "pm2_5"]).values.reshape( - 1, -1 - ) - )[0] - return new_row - - forecasts = pd.DataFrame() - forecast_model = get_trained_model_from_gcs( - project_name, bucket_name, source_blob_name - ) - df_tmp = data.copy() - for device in df_tmp["device_number"].unique(): - test_copy = df_tmp[df_tmp["device_number"] == device] - for i in range(int(configuration.HOURLY_FORECAST_HORIZON)): - new_row = get_new_row(test_copy, device, forecast_model) - test_copy = pd.concat( - [test_copy, new_row.to_frame().T], ignore_index=True - ) - forecasts = pd.concat([forecasts, test_copy], ignore_index=True) - - forecasts["device_number"] = forecasts["device_number"].astype(int) - forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) - forecasts.rename(columns={"timestamp": "time"}, inplace=True) - forecasts["time"] = pd.to_datetime(forecasts["time"], utc=True) - current_time = datetime.utcnow() - current_time_utc = pd.Timestamp(current_time, tz="UTC") - result = forecasts[fixed_columns + ["time", "pm2_5", "device_number"]][ - forecasts["time"] >= current_time_utc - ] - - return result - @staticmethod - def generate_daily_forecasts(data, project_name, bucket_name, source_blob_name): - data["timestamp"] = pd.to_datetime(data["timestamp"]) - - def get_new_row(df_tmp, device, model): - last_row = df_tmp[df_tmp["device_number"] == device].iloc[-1] - new_row = pd.Series(index=last_row.index, dtype="float64") - for i in fixed_columns: - new_row[i] = last_row[i] - new_row["timestamp"] = last_row["timestamp"] + pd.Timedelta(days=1) - new_row["device_number"] = device - new_row[f"pm2_5_last_1_day"] = last_row["pm2_5"] - new_row[f"pm2_5_last_2_day"] = last_row[f"pm2_5_last_{1}_day"] - - shifts = [3, 7, 14, 30] - functions = ["mean", "std", "max", "min"] - for s in shifts: - for f in functions: - if f == "mean": - new_row[f"pm2_5_{f}_{s}_day"] = ( - last_row["pm2_5"] + last_row[f"pm2_5_{f}_{s}_day"] * (s - 1) - ) / s - elif f == "std": - new_row[f"pm2_5_{f}_{s}_day"] = ( - np.sqrt( - (last_row["pm2_5"] - last_row[f"pm2_5_mean_{s}_day"]) - ** 2 - + (last_row[f"pm2_5_{f}_{s}_day"] ** 2 * (s - 1)) - ) - / s - ) - elif f == "max": - new_row[f"pm2_5_{f}_{s}_day"] = max( - last_row["pm2_5"], last_row[f"pm2_5_{f}_{s}_day"] - ) - elif f == 
"min": - new_row[f"pm2_5_{f}_{s}_day"] = min( - last_row["pm2_5"], last_row[f"pm2_5_{f}_{s}_day"] - ) - - # Use the date of the new row to create other features - attributes = ["year", "month", "day", "dayofweek"] - for a in attributes: - new_row[a] = new_row["timestamp"].__getattribute__(a) - new_row["week"] = new_row["timestamp"].isocalendar().week + def generate_forecasts(data, project_name, bucket_name, source_blob_name, frequency): + data['timestamp'] = pd.to_datetime(data['timestamp']) + data['pm2_5_lower'] = data['pm2_5_upper'] = data['margin_of_error'] = 0 + + def get_new_row(df, device_id, forecast_model, lower_quantile_model, upper_quantile_model, frequency): + last_row = df[df['device_id'] == device_id].iloc[-1] + new_row = pd.Series(index=last_row.index, dtype='float64') + if frequency == 'hourly': + new_row['timestamp'] = last_row['timestamp'] + pd.Timedelta(hours=1) + new_row['device_id'] = device_id + new_row[f'pm2_5_last_1_hour'] = last_row['pm2_5'] + new_row[f'pm2_5_last_2_hour'] = last_row[f'pm2_5_last_{1}_hour'] + elif frequency == 'daily': + new_row['timestamp'] = last_row['timestamp'] + pd.Timedelta(days=1) + new_row['device_id'] = device_id + new_row[f'pm2_5_last_1_day'] = last_row['pm2_5'] + new_row[f'pm2_5_last_2_day'] = last_row[f'pm2_5_last_{1}_day'] + new_row[f'f"pm2_5_last_3_day'] = last_row[f'pm2_5_last_{2}_day'] + shifts1 = [3, 7, 14] + for s in shifts1: + new_row[f'pm2_5_last_{s}_day'] = df[df['device_id'] == device_id]['pm2_5'].shift(s).iloc[-1] + + shifts2 = [3, 7, 14, 30] + functions = ['mean', 'std', 'max', 'min'] + for s in shifts2: + for f in functions: + if f == 'mean': + new_row[f'pm2_5_{f}_{s}_day'] = (last_row['pm2_5'] + last_row[f'pm2_5_{f}_{s}_day']*(s-1))/s + elif f == 'std': + new_row[f'pm2_5_{f}_{s}_day'] = np.sqrt((last_row['pm2_5'] - last_row[f'pm2_5_mean_{s}_day'])**2 + (last_row[f'pm2_5_{f}_{s}_day']**2*(s-1)))/s + elif f == 'max': + new_row[f'pm2_5_{f}_{s}_day'] = max(last_row['pm2_5'], last_row[f'pm2_5_{f}_{s}_day']) + elif f == 'min': + new_row[f'pm2_5_{f}_{s}_day'] = min(last_row['pm2_5'], last_row[f'pm2_5_{f}_{s}_day']) + attributes = ['year', 'month', 'day', 'dayofweek'] + max_vals = [2023, 12, 31, 6, 23] + if frequency == 'hourly': + attributes.extend(['hour', 'minute']) + max_vals.append([23, 59]) + for a, m in zip(attributes, max_vals): + new_row[a] = new_row['timestamp'].dt.__getattribute__(a) + new_row[a + '_sin'] = np.sin(2 * np.pi * new_row[a] / m) + new_row[a + '_cos'] = np.cos(2 * np.pi * new_row[a] / m) + new_row['week'] = new_row['timestamp'].dt.isocalendar().week + new_row['week_sin'] = np.sin(2 * np.pi * new_row['week'] / 52) + new_row['week_cos'] = np.cos(2 * np.pi * new_row['week'] / 52) + direct_forecast = forecast_model.predict(new_row.drop(['timestamp', 'pm2_5']).values.reshape(1, -1))[0] + new_row['pm2_5_lower'] = lower_quantile_model.predict(new_row.drop(['timestamp', 'pm2_5']).values.reshape(1, -1))[0] + new_row['pm2_5_upper'] = upper_quantile_model.predict(new_row.drop(['timestamp', 'pm2_5']).values.reshape(1, -1))[0] + new_row['margin_of_error'] = (new_row['pm2_5_upper'] - new_row['pm2_5_lower']) / 2 + new_row['pm2_5'] = direct_forecast + new_row['margin_of_error'] - new_row["pm2_5"] = model.predict( - new_row.drop(fixed_columns + ["timestamp", "pm2_5"]).values.reshape( - 1, -1 - ) - )[0] return new_row forecasts = pd.DataFrame() - - forecast_model = get_trained_model_from_gcs( - project_name, bucket_name, source_blob_name - ) - + forecast_model = get_trained_model_from_gcs(project_name, bucket_name, 
source_blob_name) + lower_quantile_model = get_trained_model_from_gcs(project_name, bucket_name, 'daily_forecast_model_lower_quantile.pkl') + upper_quantile_model = get_trained_model_from_gcs(project_name, bucket_name, 'daily_forecast_model_upper_quantile.pkl') df_tmp = data.copy() - for device in df_tmp["device_number"].unique(): - test_copy = df_tmp[df_tmp["device_number"] == device] - for i in range(int(configuration.DAILY_FORECAST_HORIZON)): - new_row = get_new_row( - test_copy, - device, - forecast_model, - ) - test_copy = pd.concat( - [test_copy, new_row.to_frame().T], ignore_index=True - ) + for device in df_tmp['device_id'].unique(): + test_copy = df_tmp[df_tmp['device_id'] == device] + horizon = configuration.HOURLY_FORECAST_HORIZON if frequency == 'hourly' else configuration.DAILY_FORECAST_HORIZON + for i in range(int(horizon)): + new_row = get_new_row(test_copy, device, forecast_model, lower_quantile_model, upper_quantile_model, frequency) + test_copy = pd.concat([test_copy, new_row.to_frame().T], ignore_index=True) forecasts = pd.concat([forecasts, test_copy], ignore_index=True) - forecasts["device_number"] = forecasts["device_number"].astype(int) - forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) - forecasts.rename(columns={"timestamp": "time"}, inplace=True) - current_time = datetime.utcnow() - current_time_utc = pd.Timestamp(current_time, tz="UTC") - result = forecasts[fixed_columns + ["time", "pm2_5", "device_number"]][ - forecasts["time"] >= current_time_utc - ] + forecasts['pm2_5'] = forecasts['pm2_5'].astype(float) + forecasts['pm2_5_lower'] = forecasts['pm2_5_lower'].astype(float) + forecasts['pm2_5_upper'] = forecasts['pm2_5_upper'].astype(float) + forecasts['margin_of_error'] = forecasts['margin_of_error'].astype(float) + current_time_utc = pd.Timestamp(datetime.utcnow(), tz='UTC') + result = forecasts[['timestamp', 'pm2_5', 'pm2_5_lower', 'pm2_5_upper', 'margin_of_error', 'device_id', 'site_id']][forecasts['timestamp'] >= current_time_utc] return result @staticmethod diff --git a/src/airflow/dags/ml_training_jobs.py b/src/airflow/dags/ml_training_jobs.py index 69eaaa900f..05135dc2dc 100644 --- a/src/airflow/dags/ml_training_jobs.py +++ b/src/airflow/dags/ml_training_jobs.py @@ -29,7 +29,7 @@ def fetch_training_data_for_hourly_forecast_model(): @task() def preprocess_training_data_for_hourly_forecast_model(data): - return ForecastUtils.preprocess__data(data, "hourly") + return ForecastUtils.preprocess_data(data, "hourly") @task() def feature_engineer_training_data_for_hourly_forecast_model(data): @@ -53,7 +53,7 @@ def fetch_training_data_for_daily_forecast_model(): @task() def preprocess_training_data_for_daily_forecast_model(data): - return ForecastUtils.preprocess__data(data, "daily") + return ForecastUtils.preprocess_data(data, "daily") @task() def feature_engineer_data_for_daily_forecast_model(data): From 33ee0e44d2d5ca4e2722df2837923f716fc98eae Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Tue, 22 Aug 2023 12:12:42 +0300 Subject: [PATCH 07/43] setup dags for all jobs --- src/airflow/airflow-requirements.txt | 2 +- src/airflow/airqo_etl_utils/bigquery_api.py | 17 +- src/airflow/airqo_etl_utils/ml_utils.py | 287 ++++++++++++-------- src/airflow/dags/ml_prediction_jobs.py | 51 +--- src/airflow/dags/ml_training_jobs.py | 13 +- src/airflow/dev-requirements.txt | 3 +- src/airflow/requirements.txt | 5 +- 7 files changed, 203 insertions(+), 175 deletions(-) diff --git a/src/airflow/airflow-requirements.txt 
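The forecast job above now pairs the point model with two LightGBM quantile regressors (alpha 0.025 and 0.975) and derives a margin of error from their spread. A rough standalone sketch of that idea on toy data (model sizes and array shapes are arbitrary here):

import numpy as np
from lightgbm import LGBMRegressor

rng = np.random.default_rng(1)
X = rng.normal(size=(400, 5))
y = 3.0 * X[:, 0] + rng.normal(scale=0.5, size=400)

point_model = LGBMRegressor(n_estimators=100, random_state=42).fit(X, y)

# One model per quantile; alpha sets the target quantile of the loss.
lower_model = LGBMRegressor(objective="quantile", alpha=0.025,
                            n_estimators=100, random_state=42).fit(X, y)
upper_model = LGBMRegressor(objective="quantile", alpha=0.975,
                            n_estimators=100, random_state=42).fit(X, y)

X_new = rng.normal(size=(3, 5))
pm2_5 = point_model.predict(X_new)
lower = lower_model.predict(X_new)
upper = upper_model.predict(X_new)
margin_of_error = (upper - lower) / 2  # half-width of the ~95% interval

print(pm2_5, lower, upper, margin_of_error)

Whether that half-width is then added onto the point forecast, as the patch does, or reported alongside it as an interval is a modelling choice rather than something the quantile models dictate.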
b/src/airflow/airflow-requirements.txt index 48af8ae3aa..45dd7c89e4 100644 --- a/src/airflow/airflow-requirements.txt +++ b/src/airflow/airflow-requirements.txt @@ -7,4 +7,4 @@ lightgbm mlflow gcsfs pymongo -category-encoders \ No newline at end of file +optuna \ No newline at end of file diff --git a/src/airflow/airqo_etl_utils/bigquery_api.py b/src/airflow/airqo_etl_utils/bigquery_api.py index 360fb74df0..1d720b6772 100644 --- a/src/airflow/airqo_etl_utils/bigquery_api.py +++ b/src/airflow/airqo_etl_utils/bigquery_api.py @@ -615,22 +615,7 @@ def fetch_raw_readings(self) -> pd.DataFrame: except Exception as e: raise e - def fetch_data(self, start_date_time: str, historical: bool = False): - # historical is for the actual jobs, not training - query = f""" - SELECT DISTINCT timestamp as created_at, {"site_id," if historical else ""} device_number, pm2_5_calibrated_value as pm2_5 - FROM `{self.hourly_measurements_table_prod}` - WHERE DATE(timestamp) >= '{start_date_time}' and device_number IS NOT NULL - ORDER BY created_at, device_number - """ - - job_config = bigquery.QueryJobConfig() - job_config.use_query_cache = True - - df = self.client.query(f"{query}", job_config).result().to_dataframe() - return df - - def fetch_training_data( + def fetch_data( self, start_date_time: str, ) -> pd.DataFrame: diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index d900eac7e2..68944efb59 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -64,9 +64,9 @@ class ForecastUtils: @staticmethod def preprocess_data(data, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) - data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])["pm2_5"].transform( - lambda x: x.interpolate(method="linear", limit_direction="both") - ) + data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])[ + "pm2_5" + ].transform(lambda x: x.interpolate(method="linear", limit_direction="both")) if frequency == "daily": data = ( data.groupby(["device_id", "site_id", "device_category"]) @@ -74,7 +74,9 @@ def preprocess_data(data, frequency): .mean(numeric_only=True) ) data.reset_index(inplace=True) - data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])["pm2_5"].transform( + data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])[ + "pm2_5" + ].transform( lambda x: x.interpolate(method="linear", limit_direction="both") ) data = data.dropna(subset=["pm2_5"]) @@ -122,7 +124,7 @@ def get_lag_features(df, target_col, freq): return df - def encode_categorical_features(df): + def encode_categorical_features(df, frequency): columns = ["device_id", "site_id", "device_category"] mappings = [] for col in columns: @@ -136,13 +138,13 @@ def encode_categorical_features(df): mappings.append(mapping) for i, col in enumerate(columns): upload_mapping_to_gcs( - mappings[i], project_id, bucket, f"{col}_mapping.json" + mappings[i], project_id, bucket, f"{frequency}_{col}_mapping.json" ) return df def get_time_and_cyclic_features(df, freq): attributes = ["year", "month", "day", "dayofweek"] - max_vals = [2023, 12, 31, 6, 23] + max_vals = [2023, 12, 30, 7] if freq == "hourly": attributes.extend(["hour", "minute"]) max_vals.append([23, 59]) @@ -157,14 +159,21 @@ def get_time_and_cyclic_features(df, freq): df.drop(columns=attributes + ["week"], inplace=True) return df - def decode_categorical_features(df): + def decode_categorical_features(df, frequency): columns = ["device_id", 
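preprocess_data above collapses hourly readings to daily means per device, site and category with a grouped resample, then interpolates the remaining gaps. The same pattern in isolation, on a synthetic frame (one device, three days of hourly values assumed):

import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
hourly = pd.DataFrame({
    "device_id": ["dev_a"] * 72,
    "site_id": ["site_1"] * 72,
    "device_category": ["lowcost"] * 72,
    "timestamp": pd.date_range("2023-08-01", periods=72, freq="H"),
    "pm2_5": rng.uniform(10.0, 80.0, size=72),
})

# Hourly -> daily means per device/site/category.
daily = (
    hourly.groupby(["device_id", "site_id", "device_category"])
    .resample("D", on="timestamp")
    .mean(numeric_only=True)
    .reset_index()
)

# Fill any gaps left by missing hours with a per-device linear interpolation.
daily["pm2_5"] = daily.groupby("device_id")["pm2_5"].transform(
    lambda x: x.interpolate(method="linear", limit_direction="both")
)
print(daily)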
"site_id", "device_category"] for col in columns: - mapping = get_mapping_from_gcs( - project_id, bucket, f"{col}_mapping.json" + if frequency == "hourly": + mapping = get_mapping_from_gcs( + project_id, bucket, f"hourly_{col}_mapping.json" + ) + elif frequency == "daily": + mapping = get_mapping_from_gcs( + project_id, bucket, f"daily_{col}_mapping.json" ) + df[col] = df[col].map(mapping) return df + data["timestamp"] = pd.to_datetime(data["timestamp"]) df_tmp = get_lag_features(data, target_column, frequency) df_tmp = get_time_and_cyclic_features(df_tmp, frequency) @@ -215,7 +224,6 @@ def train_and_save_forecast_models(train, frequency): test_data[target_col], ) - sampler = optuna.samplers.TPESampler() pruner = optuna.pruners.SuccessiveHalvingPruner( min_resource=10, reduction_factor=2, min_early_stopping_rate=0 @@ -260,12 +268,13 @@ def objective(trial): study.optimize(objective, n_trials=15) - mlflow.set_tracking_uri(configuration.MLFLOW_TRACKING_URI) mlflow.set_experiment(f"{frequency}_forecast_model_{environment}") registered_model_name = f"{frequency}_forecast_model_{environment}" - mlflow.lightgbm.autolog(registered_model_name=registered_model_name, log_datasets=False) + mlflow.lightgbm.autolog( + registered_model_name=registered_model_name, log_datasets=False + ) with mlflow.start_run(): best_params = study.best_params print(f"Best params are {best_params}") @@ -289,127 +298,181 @@ def objective(trial): callbacks=[early_stopping(stopping_rounds=150)], ) - # train quantile regression models for 0.025 and 0.975 quantiles - clf_025 = LGBMRegressor( - n_estimators=best_params["n_estimators"], - learning_rate=best_params["learning_rate"], - colsample_bytree=best_params["colsample_bytree"], - reg_alpha=best_params["reg_alpha"], - reg_lambda=best_params["reg_lambda"], - max_depth=best_params["max_depth"], - random_state=42, - verbosity=2, - objective="quantile", - alpha=0.025, - metric="quantile", - ) - - clf_025.fit( - train_data[features], - train_target, - eval_set=[(test_data[features], test_target)], - categorical_feature=["device_id", "site_id", "device_category"], - ) - - clf_975 = LGBMRegressor( - n_estimators=best_params["n_estimators"], - learning_rate=best_params["learning_rate"], - colsample_bytree=best_params["colsample_bytree"], - reg_alpha=best_params["reg_alpha"], - reg_lambda=best_params["reg_lambda"], - max_depth=best_params["max_depth"], - random_state=42, - verbosity=2, - objective="quantile", - alpha=0.975, - metric="quantile", - ) + upload_trained_model_to_gcs( + clf, project_id, bucket, f"{frequency}_forecast_model.pkl" + ) - clf_975.fit( - train_data[features], - train_target, - eval_set=[(test_data[features], test_target)], - categorical_feature=["device_id", "site_id", "device_category"], - ) + alphas = [0.025, 0.975] + models = [] + names = [f'{frequency}_lower_quantile_model', f'{frequency}_upper_quantile_model'] - upload_trained_model_to_gcs( - clf, project_id, bucket, "hourly_forecast_model.pkl" + for alpha in alphas: + clf = LGBMRegressor( + n_estimators=best_params["n_estimators"], + learning_rate=best_params["learning_rate"], + colsample_bytree=best_params["colsample_bytree"], + reg_alpha=best_params["reg_alpha"], + reg_lambda=best_params["reg_lambda"], + max_depth=best_params["max_depth"], + random_state=42, + verbosity=2, + objective="quantile", + alpha=alpha, + metric="quantile", ) + clf.fit( + train_data[features], + train_target, + eval_set=[(test_data[features], test_target)], + categorical_feature=["device_id", "site_id", "device_category"], + ) + 
models.append(clf) + for n, m in zip(names, models): + upload_trained_model_to_gcs( + m, project_id, bucket, f"{n}.pkl" + ) + @staticmethod - def generate_forecasts(data, project_name, bucket_name, source_blob_name, frequency): - data['timestamp'] = pd.to_datetime(data['timestamp']) - data['pm2_5_lower'] = data['pm2_5_upper'] = data['margin_of_error'] = 0 - - def get_new_row(df, device_id, forecast_model, lower_quantile_model, upper_quantile_model, frequency): - last_row = df[df['device_id'] == device_id].iloc[-1] - new_row = pd.Series(index=last_row.index, dtype='float64') - if frequency == 'hourly': - new_row['timestamp'] = last_row['timestamp'] + pd.Timedelta(hours=1) - new_row['device_id'] = device_id - new_row[f'pm2_5_last_1_hour'] = last_row['pm2_5'] - new_row[f'pm2_5_last_2_hour'] = last_row[f'pm2_5_last_{1}_hour'] - elif frequency == 'daily': - new_row['timestamp'] = last_row['timestamp'] + pd.Timedelta(days=1) - new_row['device_id'] = device_id - new_row[f'pm2_5_last_1_day'] = last_row['pm2_5'] - new_row[f'pm2_5_last_2_day'] = last_row[f'pm2_5_last_{1}_day'] - new_row[f'f"pm2_5_last_3_day'] = last_row[f'pm2_5_last_{2}_day'] + def generate_forecasts( + data, project_name, bucket_name, frequency + ): + data["timestamp"] = pd.to_datetime(data["timestamp"]) + data["pm2_5_lower"] = data["pm2_5_upper"] = data["margin_of_error"] = 0 + + def get_new_row( + df, + device_id, + forecast_model, + lower_quantile_model, + upper_quantile_model, + frequency, + ): + last_row = df[df["device_id"] == device_id].iloc[-1] + new_row = pd.Series(index=last_row.index, dtype="float64") + if frequency == "hourly": + new_row["timestamp"] = last_row["timestamp"] + pd.Timedelta(hours=1) + new_row["device_id"] = device_id + new_row[f"pm2_5_last_1_hour"] = last_row["pm2_5"] + new_row[f"pm2_5_last_2_hour"] = last_row[f"pm2_5_last_{1}_hour"] + elif frequency == "daily": + new_row["timestamp"] = last_row["timestamp"] + pd.Timedelta(days=1) + new_row["device_id"] = device_id + new_row[f"pm2_5_last_1_day"] = last_row["pm2_5"] + new_row[f"pm2_5_last_2_day"] = last_row[f"pm2_5_last_{1}_day"] + new_row[f'f"pm2_5_last_3_day'] = last_row[f"pm2_5_last_{2}_day"] shifts1 = [3, 7, 14] for s in shifts1: - new_row[f'pm2_5_last_{s}_day'] = df[df['device_id'] == device_id]['pm2_5'].shift(s).iloc[-1] + new_row[f"pm2_5_last_{s}_day"] = ( + df[df["device_id"] == device_id]["pm2_5"].shift(s).iloc[-1] + ) shifts2 = [3, 7, 14, 30] - functions = ['mean', 'std', 'max', 'min'] + functions = ["mean", "std", "max", "min"] for s in shifts2: for f in functions: - if f == 'mean': - new_row[f'pm2_5_{f}_{s}_day'] = (last_row['pm2_5'] + last_row[f'pm2_5_{f}_{s}_day']*(s-1))/s - elif f == 'std': - new_row[f'pm2_5_{f}_{s}_day'] = np.sqrt((last_row['pm2_5'] - last_row[f'pm2_5_mean_{s}_day'])**2 + (last_row[f'pm2_5_{f}_{s}_day']**2*(s-1)))/s - elif f == 'max': - new_row[f'pm2_5_{f}_{s}_day'] = max(last_row['pm2_5'], last_row[f'pm2_5_{f}_{s}_day']) - elif f == 'min': - new_row[f'pm2_5_{f}_{s}_day'] = min(last_row['pm2_5'], last_row[f'pm2_5_{f}_{s}_day']) - attributes = ['year', 'month', 'day', 'dayofweek'] + if f == "mean": + new_row[f"pm2_5_{f}_{s}_day"] = ( + last_row["pm2_5"] + + last_row[f"pm2_5_{f}_{s}_day"] * (s - 1) + ) / s + elif f == "std": + new_row[f"pm2_5_{f}_{s}_day"] = ( + np.sqrt( + ( + last_row["pm2_5"] + - last_row[f"pm2_5_mean_{s}_day"] + ) + ** 2 + + (last_row[f"pm2_5_{f}_{s}_day"] ** 2 * (s - 1)) + ) + / s + ) + elif f == "max": + new_row[f"pm2_5_{f}_{s}_day"] = max( + last_row["pm2_5"], last_row[f"pm2_5_{f}_{s}_day"] + ) + elif f 
== "min": + new_row[f"pm2_5_{f}_{s}_day"] = min( + last_row["pm2_5"], last_row[f"pm2_5_{f}_{s}_day"] + ) + attributes = ["year", "month", "day", "dayofweek"] max_vals = [2023, 12, 31, 6, 23] - if frequency == 'hourly': - attributes.extend(['hour', 'minute']) + if frequency == "hourly": + attributes.extend(["hour", "minute"]) max_vals.append([23, 59]) for a, m in zip(attributes, max_vals): - new_row[a] = new_row['timestamp'].dt.__getattribute__(a) - new_row[a + '_sin'] = np.sin(2 * np.pi * new_row[a] / m) - new_row[a + '_cos'] = np.cos(2 * np.pi * new_row[a] / m) - new_row['week'] = new_row['timestamp'].dt.isocalendar().week - new_row['week_sin'] = np.sin(2 * np.pi * new_row['week'] / 52) - new_row['week_cos'] = np.cos(2 * np.pi * new_row['week'] / 52) - direct_forecast = forecast_model.predict(new_row.drop(['timestamp', 'pm2_5']).values.reshape(1, -1))[0] - new_row['pm2_5_lower'] = lower_quantile_model.predict(new_row.drop(['timestamp', 'pm2_5']).values.reshape(1, -1))[0] - new_row['pm2_5_upper'] = upper_quantile_model.predict(new_row.drop(['timestamp', 'pm2_5']).values.reshape(1, -1))[0] - new_row['margin_of_error'] = (new_row['pm2_5_upper'] - new_row['pm2_5_lower']) / 2 - new_row['pm2_5'] = direct_forecast + new_row['margin_of_error'] + new_row[a] = new_row["timestamp"].dt.__getattribute__(a) + new_row[a + "_sin"] = np.sin(2 * np.pi * new_row[a] / m) + new_row[a + "_cos"] = np.cos(2 * np.pi * new_row[a] / m) + new_row["week"] = new_row["timestamp"].dt.isocalendar().week + new_row["week_sin"] = np.sin(2 * np.pi * new_row["week"] / 52) + new_row["week_cos"] = np.cos(2 * np.pi * new_row["week"] / 52) + direct_forecast = forecast_model.predict( + new_row.drop(["timestamp", "pm2_5"]).values.reshape(1, -1) + )[0] + new_row["pm2_5_lower"] = lower_quantile_model.predict( + new_row.drop(["timestamp", "pm2_5"]).values.reshape(1, -1) + )[0] + new_row["pm2_5_upper"] = upper_quantile_model.predict( + new_row.drop(["timestamp", "pm2_5"]).values.reshape(1, -1) + )[0] + new_row["margin_of_error"] = ( + new_row["pm2_5_upper"] - new_row["pm2_5_lower"] + ) / 2 + new_row["pm2_5"] = direct_forecast + new_row["margin_of_error"] return new_row forecasts = pd.DataFrame() - forecast_model = get_trained_model_from_gcs(project_name, bucket_name, source_blob_name) - lower_quantile_model = get_trained_model_from_gcs(project_name, bucket_name, 'daily_forecast_model_lower_quantile.pkl') - upper_quantile_model = get_trained_model_from_gcs(project_name, bucket_name, 'daily_forecast_model_upper_quantile.pkl') + forecast_model = get_trained_model_from_gcs( + project_name, bucket_name, f"{frequency}_forecast_model.pkl" + ) + lower_quantile_model = get_trained_model_from_gcs( + project_name, bucket_name, f"{frequency}_lower_quantile_model.pkl" + ) + upper_quantile_model = get_trained_model_from_gcs( + project_name, bucket_name, f"{frequency}_upper_quantile_model.pkl" + ) df_tmp = data.copy() - for device in df_tmp['device_id'].unique(): - test_copy = df_tmp[df_tmp['device_id'] == device] - horizon = configuration.HOURLY_FORECAST_HORIZON if frequency == 'hourly' else configuration.DAILY_FORECAST_HORIZON + for device in df_tmp["device_id"].unique(): + test_copy = df_tmp[df_tmp["device_id"] == device] + horizon = ( + configuration.HOURLY_FORECAST_HORIZON + if frequency == "hourly" + else configuration.DAILY_FORECAST_HORIZON + ) for i in range(int(horizon)): - new_row = get_new_row(test_copy, device, forecast_model, lower_quantile_model, upper_quantile_model, frequency) - test_copy = pd.concat([test_copy, new_row.to_frame().T], 
ignore_index=True) + new_row = get_new_row( + test_copy, + device, + forecast_model, + lower_quantile_model, + upper_quantile_model, + frequency, + ) + test_copy = pd.concat( + [test_copy, new_row.to_frame().T], ignore_index=True + ) forecasts = pd.concat([forecasts, test_copy], ignore_index=True) - forecasts['pm2_5'] = forecasts['pm2_5'].astype(float) - forecasts['pm2_5_lower'] = forecasts['pm2_5_lower'].astype(float) - forecasts['pm2_5_upper'] = forecasts['pm2_5_upper'].astype(float) - forecasts['margin_of_error'] = forecasts['margin_of_error'].astype(float) - current_time_utc = pd.Timestamp(datetime.utcnow(), tz='UTC') - result = forecasts[['timestamp', 'pm2_5', 'pm2_5_lower', 'pm2_5_upper', 'margin_of_error', 'device_id', 'site_id']][forecasts['timestamp'] >= current_time_utc] + forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) + forecasts["pm2_5_lower"] = forecasts["pm2_5_lower"].astype(float) + forecasts["pm2_5_upper"] = forecasts["pm2_5_upper"].astype(float) + forecasts["margin_of_error"] = forecasts["margin_of_error"].astype(float) + current_time_utc = pd.Timestamp(datetime.utcnow(), tz="UTC") + forecasts.rename(columns={"timestamp": "time"}, inplace=True) + result = forecasts[ + [ + "timestamp", + "pm2_5", + "pm2_5_lower", + "pm2_5_upper", + "margin_of_error", + "device_id", + "site_id", + ] + ][forecasts["time"] >= current_time_utc] return result @staticmethod diff --git a/src/airflow/dags/ml_prediction_jobs.py b/src/airflow/dags/ml_prediction_jobs.py index 2f48d19d68..f90233afa4 100644 --- a/src/airflow/dags/ml_prediction_jobs.py +++ b/src/airflow/dags/ml_prediction_jobs.py @@ -27,25 +27,19 @@ def get_historical_data_for_hourly_forecasts(): from airqo_etl_utils.date import date_to_str start_date = date_to_str(start_date, str_format="%Y-%m-%d") - return BigQueryApi().fetch_data(start_date, historical=True) + return BigQueryApi().fetch_data(start_date) @task() def preprocess_historical_data_hourly_forecast(data): - return ForecastUtils.preprocess_historical_data(data, "hourly") - - @task() - def add_lag_features_historical_data_hourly_forecast(data): - return ForecastUtils.get_lag_features(data, "pm2_5", frequency="hourly") + return ForecastUtils.preprocess_data(data, "hourly") @task - def add_timestep_features_historical_data_hourly_forecasts(data): - return ForecastUtils.get_time_features(data, frequency="hourly") + def feature_eng_hourly_historical_data(data): + return ForecastUtils.feature_eng_data(data, 'pm2_5', 'hourly', 'predict') @task() def make_hourly_forecasts(data): - return ForecastUtils.generate_hourly_forecasts( - data, project_id, bucket, "hourly_forecast_model.pkl" - ) + return ForecastUtils.generate_forecasts(data=data, project_name=project_id, bucket_name= bucket,frequency='hourly') @task() def save_hourly_forecasts_to_bigquery(data): @@ -67,25 +61,20 @@ def get_historical_data_for_daily_forecasts(): days=int(configuration.DAILY_FORECAST_PREDICTION_JOB_SCOPE) ) start_date = date_to_str(start_date, str_format="%Y-%m-%d") - return BigQueryApi().fetch_data(start_date, historical=True) + return BigQueryApi().fetch_data(start_date) @task() def preprocess_historical_data_daily_forecast(data): - return ForecastUtils.preprocess_historical_data(data, "daily") + return ForecastUtils.preprocess_data(data, "daily") @task() - def add_lag_features_historical_data_daily_forecast(data): - return ForecastUtils.get_lag_features(data, "pm2_5", frequency="daily") - - @task() - def add_timestep_features_historical_data_daily_forecast(data): - return 
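generate_forecasts above rolls the model forward one step at a time: each predicted row is appended to the device's frame so the next step's lag features can be rebuilt from it. Stripped down to a single lag and a toy model, the recursion looks roughly like this (column names kept, everything else invented):

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor

rng = np.random.default_rng(3)
history = pd.DataFrame({
    "timestamp": pd.date_range("2023-08-01", periods=60, freq="D"),
    "pm2_5": rng.uniform(10.0, 80.0, size=60),
})
history["pm2_5_last_1_day"] = history["pm2_5"].shift(1)
train = history.dropna()

model = LGBMRegressor(n_estimators=50, random_state=42)
model.fit(train[["pm2_5_last_1_day"]], train["pm2_5"])

horizon = 7
frame = history.copy()
for _ in range(horizon):
    last = frame.iloc[-1]
    new_row = {
        "timestamp": last["timestamp"] + pd.Timedelta(days=1),
        "pm2_5_last_1_day": last["pm2_5"],  # yesterday's value becomes today's lag
    }
    new_row["pm2_5"] = model.predict(pd.DataFrame([new_row])[["pm2_5_last_1_day"]])[0]
    frame = pd.concat([frame, pd.DataFrame([new_row])], ignore_index=True)

print(frame.tail(horizon))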
ForecastUtils.get_time_features(data, "daily") + def feature_engineer_daily_historical_data(data): + return ForecastUtils.feature_eng_data(data, 'pm2_5', 'daily', 'predict') @task() def make_daily_forecasts(data): - return ForecastUtils.generate_daily_forecasts( - data, project_id, bucket, "daily_forecast_model.pkl" - ) + return ForecastUtils.generate_forecasts(data, project_id, bucket, 'daily') + @task() def save_daily_forecasts_to_bigquery(data): @@ -99,25 +88,15 @@ def save_daily_forecasts_to_mongo(data): hourly_data = get_historical_data_for_hourly_forecasts() preprocessed_hourly_data = preprocess_historical_data_hourly_forecast(hourly_data) - lagged_hourly_data = add_lag_features_historical_data_hourly_forecast( - preprocessed_hourly_data - ) - time_features_hourly_data = add_timestep_features_historical_data_hourly_forecasts( - lagged_hourly_data - ) - hourly_forecasts = make_hourly_forecasts(time_features_hourly_data) + feat_data = feature_eng_hourly_historical_data(preprocessed_hourly_data) + hourly_forecasts = make_hourly_forecasts(feat_data) save_hourly_forecasts_to_bigquery(hourly_forecasts) save_hourly_forecasts_to_mongo(hourly_forecasts) daily_data = get_historical_data_for_daily_forecasts() preprocessed_daily_data = preprocess_historical_data_daily_forecast(daily_data) - lagged_daily_data = add_lag_features_historical_data_daily_forecast( - preprocessed_daily_data - ) - time_features_daily_data = add_timestep_features_historical_data_daily_forecast( - lagged_daily_data - ) - daily_forecasts = make_daily_forecasts(time_features_daily_data) + feat_data = feature_engineer_daily_historical_data(preprocessed_daily_data) + daily_forecasts = make_daily_forecasts(feat_data) save_daily_forecasts_to_bigquery(daily_forecasts) save_daily_forecasts_to_mongo(daily_forecasts) diff --git a/src/airflow/dags/ml_training_jobs.py b/src/airflow/dags/ml_training_jobs.py index 05135dc2dc..32287ee59c 100644 --- a/src/airflow/dags/ml_training_jobs.py +++ b/src/airflow/dags/ml_training_jobs.py @@ -15,6 +15,8 @@ tags=["airqo", "hourly-forecast", "daily-forecast", "training-job"], ) def train_forecasting_models(): + + # Hourly forecast tasks @task() def fetch_training_data_for_hourly_forecast_model(): from dateutil.relativedelta import relativedelta @@ -26,19 +28,20 @@ def fetch_training_data_for_hourly_forecast_model(): ) start_date = date_to_str(start_date, str_format="%Y-%m-%d") return BigQueryApi().fetch_data(start_date) - @task() def preprocess_training_data_for_hourly_forecast_model(data): return ForecastUtils.preprocess_data(data, "hourly") @task() def feature_engineer_training_data_for_hourly_forecast_model(data): - return ForecastUtils.feature_eng_training_data(data, "pm2_5", "hourly") + return ForecastUtils.feature_eng_data(data, "pm2_5", "hourly", "train") @task() def train_and_save_hourly_forecast_model(train_data): - return ForecastUtils.train_and_save_hourly_forecast_model(train_data) + return ForecastUtils.train_and_save_forecast_models(train_data, frequency='hourly') + +# Daily forecast tasks @task() def fetch_training_data_for_daily_forecast_model(): from dateutil.relativedelta import relativedelta @@ -57,11 +60,11 @@ def preprocess_training_data_for_daily_forecast_model(data): @task() def feature_engineer_data_for_daily_forecast_model(data): - return ForecastUtils.feature_eng_training_data(data, "pm2_5", "daily") + return ForecastUtils.feature_eng_data(data, "pm2_5", "daily", "train") @task() def train_and_save_daily_model(train_data): - return 
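The DAG changes above wire the tasks together by passing each task's return value into the next, which is Airflow's TaskFlow pattern (intermediate results travel via XCom). A minimal sketch of that wiring with placeholder tasks, not the project's:

from datetime import datetime

from airflow.decorators import dag, task

@dag(schedule=None, start_date=datetime(2023, 8, 1), catchup=False, tags=["example"])
def toy_training_pipeline():
    @task()
    def fetch():
        return [1.0, 2.0, 3.0]

    @task()
    def preprocess(values):
        return [v * 2 for v in values]

    @task()
    def train(values):
        print(f"training on {values}")

    # Calling one task with another's output creates the dependency chain.
    train(preprocess(fetch()))

toy_training_pipeline()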
ForecastUtils.train_and_save_daily_forecast_model(train_data) + return ForecastUtils.train_and_save_forecast_models(train_data, "daily") hourly_data = fetch_training_data_for_hourly_forecast_model() hourly_data = preprocess_training_data_for_hourly_forecast_model(hourly_data) diff --git a/src/airflow/dev-requirements.txt b/src/airflow/dev-requirements.txt index 59d0561bea..81c23b0562 100644 --- a/src/airflow/dev-requirements.txt +++ b/src/airflow/dev-requirements.txt @@ -18,5 +18,4 @@ mlflow lightgbm gcsfs pymongo -pytest -category_encoders \ No newline at end of file +pytest \ No newline at end of file diff --git a/src/airflow/requirements.txt b/src/airflow/requirements.txt index 947c051adc..7396bc939c 100644 --- a/src/airflow/requirements.txt +++ b/src/airflow/requirements.txt @@ -16,7 +16,6 @@ joblib~=1.3.1 scikit-learn~=1.3.0 gcsfs pymongo~=4.4.1 - +optuna pytest~=7.4.0 -scipy~=1.11.1 -category_encoders \ No newline at end of file +scipy~=1.11.1 \ No newline at end of file From b5a5ac66f51757813534ec29827c9b312f157da8 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Tue, 22 Aug 2023 18:34:24 +0300 Subject: [PATCH 08/43] cleans up training job code --- src/airflow/airqo_etl_utils/ml_utils.py | 58 +++++++++++++------------ 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 68944efb59..719f136228 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -85,10 +85,11 @@ def preprocess_data(data, frequency): @staticmethod def feature_eng_data(data, target_column, frequency, job_type): def get_lag_features(df, target_col, freq): + df1 = df.copy() if freq == "daily": shifts = [1, 2, 3, 7, 14] for s in shifts: - df[f"pm2_5_last_{s}_day"] = df.groupby(["device_id"])[ + df1[f"pm2_5_last_{s}_day"] = df1.groupby(["device_id"])[ target_col ].shift(s) @@ -96,8 +97,8 @@ def get_lag_features(df, target_col, freq): functions = ["mean", "std", "max", "min"] for s in shifts: for f in functions: - df[f"pm2_5_{f}_{s}_day"] = ( - df.groupby(["device_id"])[target_col] + df1[f"pm2_5_{f}_{s}_day"] = ( + df1.groupby(["device_id"])[target_col] .shift(1) .rolling(s) .agg(f) @@ -105,7 +106,7 @@ def get_lag_features(df, target_col, freq): elif freq == "hourly": shifts = [1, 2, 6, 12] for s in shifts: - df[f"pm2_5_last_{s}_hour"] = df.groupby(["device_id"])[ + df1[f"pm2_5_last_{s}_hour"] = df1.groupby(["device_id"])[ target_col ].shift(s) @@ -113,8 +114,8 @@ def get_lag_features(df, target_col, freq): functions = ["mean", "std", "median", "skew"] for s in shifts: for f in functions: - df[f"pm2_5_{f}_{s}_hour"] = ( - df.groupby(["device_id"])[target_col] + df1[f"pm2_5_{f}_{s}_hour"] = ( + df1.groupby(["device_id"])[target_col] .shift(1) .rolling(s) .agg(f) @@ -122,42 +123,44 @@ def get_lag_features(df, target_col, freq): else: raise ValueError("Invalid frequency") - return df + return df1 def encode_categorical_features(df, frequency): + df1 = df.copy() columns = ["device_id", "site_id", "device_category"] mappings = [] for col in columns: mapping = {} - for val in df[col].unique(): + for val in df1[col].unique(): num = random.randint(0, 10000) while num in mapping.values(): num = random.randint(0, 10000) mapping[val] = num - df[col] = df[col].map(mapping) + df1[col] = df1[col].map(mapping) mappings.append(mapping) for i, col in enumerate(columns): upload_mapping_to_gcs( mappings[i], project_id, bucket, 
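With category_encoders dropped from the requirements, the training job above now assigns each category its own random integer and persists the mapping as JSON so the prediction job can re-apply it. A small sketch of that round trip, with a local file standing in for the upload_mapping_to_gcs and get_mapping_from_gcs helpers:

import json
import random

import pandas as pd

train = pd.DataFrame({"device_id": ["dev_a", "dev_b", "dev_a", "dev_c"]})

# Draw a unique random code per category, as the patch does.
mapping = {}
for val in train["device_id"].unique():
    num = random.randint(0, 10000)
    while num in mapping.values():
        num = random.randint(0, 10000)
    mapping[val] = num
train["device_id"] = train["device_id"].map(mapping)

# Persist the mapping; the project writes this JSON to GCS instead of disk.
with open("device_id_mapping.json", "w") as f:
    json.dump(mapping, f)

# The prediction job later loads the same mapping and applies it to new data.
with open("device_id_mapping.json") as f:
    loaded = json.load(f)
new_data = pd.DataFrame({"device_id": ["dev_b", "dev_c"]})
new_data["device_id"] = new_data["device_id"].map(loaded)
print(new_data)

Because the integer codes carry no order, the training code also declares these columns as categorical_feature when fitting LightGBM so they are not treated as continuous values.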
f"{frequency}_{col}_mapping.json" ) - return df + return df1 def get_time_and_cyclic_features(df, freq): + df1 = df.copy() attributes = ["year", "month", "day", "dayofweek"] max_vals = [2023, 12, 30, 7] if freq == "hourly": - attributes.extend(["hour", "minute"]) - max_vals.append([23, 59]) + attributes.append("hour") + max_vals.append(23) for a, m in zip(attributes, max_vals): - df[a] = df["timestamp"].dt.__getattribute__(a) - df[a + "_sin"] = np.sin(2 * np.pi * df[a] / m) - df[a + "_cos"] = np.cos(2 * np.pi * df[a] / m) - - df["week"] = df["timestamp"].dt.isocalendar().week - df["week_sin"] = np.sin(2 * np.pi * df["week"] / 52) - df["week_cos"] = np.cos(2 * np.pi * df["week"] / 52) - df.drop(columns=attributes + ["week"], inplace=True) - return df + df1[a] = df1["timestamp"].dt.__getattribute__(a) + df1[a + "_sin"] = np.sin(2 * np.pi * df1[a] / m) + df1[a + "_cos"] = np.cos(2 * np.pi * df1[a] / m) + + df1["week"] = df1["timestamp"].dt.isocalendar().week + df1["week_sin"] = np.sin(2 * np.pi * df1["week"] / 52) + df1["week_cos"] = np.cos(2 * np.pi * df1["week"] / 52) + df1.drop(columns=attributes + ["week"], inplace=True) + return df1 def decode_categorical_features(df, frequency): columns = ["device_id", "site_id", "device_category"] @@ -174,11 +177,12 @@ def decode_categorical_features(df, frequency): df[col] = df[col].map(mapping) return df - data["timestamp"] = pd.to_datetime(data["timestamp"]) - df_tmp = get_lag_features(data, target_column, frequency) + df_tmp = data.copy() + df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) + df_tmp = get_lag_features(df_tmp, target_column, frequency) df_tmp = get_time_and_cyclic_features(df_tmp, frequency) if job_type == "train": - df_tmp = encode_categorical_features(df_tmp) + df_tmp = encode_categorical_features(df_tmp, frequency) elif job_type == "predict": df_tmp = decode_categorical_features(df_tmp) @@ -199,9 +203,9 @@ def train_and_save_forecast_models(train, frequency): months = device_df["timestamp"].dt.month.unique() train_months = val_months = test_months = [] if frequency == "hourly": - train_months = months[:8] - val_months = months[9] - test_months = months[10] + train_months = months[:4] + val_months = months[4:5] + test_months = months[5:] elif frequency == "daily": train_months = months[:8] val_months = months[8:9] From 255a0cd05c837ab283e2c542d3d0ef43d075d9b2 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Wed, 23 Aug 2023 01:14:53 +0300 Subject: [PATCH 09/43] refactor prediction job --- src/airflow/airqo_etl_utils/ml_utils.py | 193 ++++++++++++------------ 1 file changed, 94 insertions(+), 99 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 719f136228..c13ba9d7c8 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -59,6 +59,20 @@ def get_mapping_from_gcs(project_name, bucket_name, source_blob_name): mapping_dict = json.load(f) return mapping_dict +def decode_categorical_features(df, frequency): + columns = ["device_id", "site_id", "device_category"] + for col in columns: + if frequency == "hourly": + mapping = get_mapping_from_gcs( + project_id, bucket, f"hourly_{col}_mapping.json" + ) + elif frequency == "daily": + mapping = get_mapping_from_gcs( + project_id, bucket, f"daily_{col}_mapping.json" + ) + + df[col] = df[col].map(mapping) + return df class ForecastUtils: @staticmethod @@ -162,20 +176,7 @@ def get_time_and_cyclic_features(df, freq): 
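The sin/cos pairs built here encode calendar fields so that cyclically adjacent values (hour 23 and hour 0, December and January) land close together in feature space; the divisor should be the full length of the cycle so the two ends of the range meet, for example 24 for hours. A standalone sketch of the encoding with the same column naming:

import numpy as np
import pandas as pd

df = pd.DataFrame({"timestamp": pd.date_range("2023-12-30 22:00", periods=6, freq="H")})

for attr, period in [("month", 12), ("dayofweek", 7), ("hour", 24)]:
    values = getattr(df["timestamp"].dt, attr)
    df[f"{attr}_sin"] = np.sin(2 * np.pi * values / period)
    df[f"{attr}_cos"] = np.cos(2 * np.pi * values / period)

week = df["timestamp"].dt.isocalendar().week
df["week_sin"] = np.sin(2 * np.pi * week / 52)
df["week_cos"] = np.cos(2 * np.pi * week / 52)

print(df.round(3))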
df1.drop(columns=attributes + ["week"], inplace=True) return df1 - def decode_categorical_features(df, frequency): - columns = ["device_id", "site_id", "device_category"] - for col in columns: - if frequency == "hourly": - mapping = get_mapping_from_gcs( - project_id, bucket, f"hourly_{col}_mapping.json" - ) - elif frequency == "daily": - mapping = get_mapping_from_gcs( - project_id, bucket, f"daily_{col}_mapping.json" - ) - - df[col] = df[col].map(mapping) - return df + df_tmp = data.copy() df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) @@ -184,7 +185,11 @@ def decode_categorical_features(df, frequency): if job_type == "train": df_tmp = encode_categorical_features(df_tmp, frequency) elif job_type == "predict": - df_tmp = decode_categorical_features(df_tmp) + df_tmp = decode_categorical_features(df_tmp, frequency) + #convert the categorical columns to int + df_tmp['device_id'] = df_tmp['device_id'].astype(int) + df_tmp['site_id'] = df_tmp['site_id'].astype(int) + df_tmp['device_category'] = df_tmp['device_category'].astype(int) return df_tmp @@ -344,89 +349,77 @@ def generate_forecasts( data["timestamp"] = pd.to_datetime(data["timestamp"]) data["pm2_5_lower"] = data["pm2_5_upper"] = data["margin_of_error"] = 0 - def get_new_row( + def get_forecasts( df, - device_id, forecast_model, lower_quantile_model, upper_quantile_model, frequency, + horizon ): - last_row = df[df["device_id"] == device_id].iloc[-1] - new_row = pd.Series(index=last_row.index, dtype="float64") - if frequency == "hourly": - new_row["timestamp"] = last_row["timestamp"] + pd.Timedelta(hours=1) - new_row["device_id"] = device_id - new_row[f"pm2_5_last_1_hour"] = last_row["pm2_5"] - new_row[f"pm2_5_last_2_hour"] = last_row[f"pm2_5_last_{1}_hour"] - elif frequency == "daily": - new_row["timestamp"] = last_row["timestamp"] + pd.Timedelta(days=1) - new_row["device_id"] = device_id - new_row[f"pm2_5_last_1_day"] = last_row["pm2_5"] - new_row[f"pm2_5_last_2_day"] = last_row[f"pm2_5_last_{1}_day"] - new_row[f'f"pm2_5_last_3_day'] = last_row[f"pm2_5_last_{2}_day"] - shifts1 = [3, 7, 14] - for s in shifts1: - new_row[f"pm2_5_last_{s}_day"] = ( - df[df["device_id"] == device_id]["pm2_5"].shift(s).iloc[-1] - ) - - shifts2 = [3, 7, 14, 30] - functions = ["mean", "std", "max", "min"] - for s in shifts2: - for f in functions: - if f == "mean": - new_row[f"pm2_5_{f}_{s}_day"] = ( - last_row["pm2_5"] - + last_row[f"pm2_5_{f}_{s}_day"] * (s - 1) - ) / s - elif f == "std": - new_row[f"pm2_5_{f}_{s}_day"] = ( - np.sqrt( - ( - last_row["pm2_5"] - - last_row[f"pm2_5_mean_{s}_day"] - ) - ** 2 - + (last_row[f"pm2_5_{f}_{s}_day"] ** 2 * (s - 1)) - ) - / s - ) - elif f == "max": - new_row[f"pm2_5_{f}_{s}_day"] = max( - last_row["pm2_5"], last_row[f"pm2_5_{f}_{s}_day"] - ) - elif f == "min": - new_row[f"pm2_5_{f}_{s}_day"] = min( - last_row["pm2_5"], last_row[f"pm2_5_{f}_{s}_day"] - ) - attributes = ["year", "month", "day", "dayofweek"] - max_vals = [2023, 12, 31, 6, 23] - if frequency == "hourly": - attributes.extend(["hour", "minute"]) - max_vals.append([23, 59]) - for a, m in zip(attributes, max_vals): - new_row[a] = new_row["timestamp"].dt.__getattribute__(a) - new_row[a + "_sin"] = np.sin(2 * np.pi * new_row[a] / m) - new_row[a + "_cos"] = np.cos(2 * np.pi * new_row[a] / m) - new_row["week"] = new_row["timestamp"].dt.isocalendar().week - new_row["week_sin"] = np.sin(2 * np.pi * new_row["week"] / 52) - new_row["week_cos"] = np.cos(2 * np.pi * new_row["week"] / 52) - direct_forecast = forecast_model.predict( - new_row.drop(["timestamp", 
"pm2_5"]).values.reshape(1, -1) - )[0] - new_row["pm2_5_lower"] = lower_quantile_model.predict( - new_row.drop(["timestamp", "pm2_5"]).values.reshape(1, -1) - )[0] - new_row["pm2_5_upper"] = upper_quantile_model.predict( - new_row.drop(["timestamp", "pm2_5"]).values.reshape(1, -1) - )[0] - new_row["margin_of_error"] = ( - new_row["pm2_5_upper"] - new_row["pm2_5_lower"] - ) / 2 - new_row["pm2_5"] = direct_forecast + new_row["margin_of_error"] - - return new_row + """This method generates forecasts for a given device dataframe basing on horizon provided""" + df_tmp = df.copy() + for i in range(int(horizon)): + df_tmp = pd.concat([df_tmp, df.iloc[-1]], ignore_index=True) + similar_columns = ['site_id', 'device_id', 'device_category', 'latitude', 'longitude'] + for col in similar_columns: + df_tmp.iloc[-1, df_tmp.columns.get_loc(col)] = df_tmp.iloc[-2, df_tmp.columns.get_loc(col)] + + #daily frequency + if frequency == 'daily': + df_tmp.iloc[-1, df_tmp.columns.get_loc('timestamp')] = df.iloc[-2, df_tmp.columns.get_loc('timestamp')] + pd.Timedelta(days=1) + + #lag features + shifts1 = [1,2,3,7,14] + for s in shifts1: + df_tmp.iloc[-1, df_tmp.columns.get_loc(f"pm2_5_last_{s}_day")] = df_tmp['pm2_5'].shift(s) + + #rolling features + shifts2 = [2,3,7,14] + functions = ['mean', 'std', 'max', 'min'] + for s in shifts2: + for f in functions: + df_tmp.iloc[-1, df_tmp.columns.get_loc(f"pm2_5_{f}_{s}_day")] = df_tmp['pm2_5'].shift(1).rolling(s).agg(f) + + # hourly frequency + elif frequency == 'hourly': + df_tmp.iloc[-1, df_tmp.columns.get_loc('timestamp')] = df.iloc[-2, df_tmp.columns.get_loc('timestamp')] + pd.Timedelta(hours=1) + + #lag features + shifts1 = [1,2,6,12] + for s in shifts1: + df_tmp.iloc[-1, df_tmp.columns.get_loc(f"pm2_5_last_{s}_hour")] = df_tmp['pm2_5'].shift(s) + + #rolling features + shifts2 = [3,6,12,24] + functions = ['mean', 'std', 'median', 'skew'] + for s in shifts2: + for f in functions: + df_tmp.iloc[-1, df_tmp.columns.get_loc(f"pm2_5_{f}_{s}_hour")] = df_tmp['pm2_5'].shift(1).rolling(s).agg(f) + + #time and cyclic features + attributes = ['year', 'month', 'day', 'dayofweek'] + max_vals = [2023, 12, 30, 7] + if frequency == 'hourly': + attributes.append('hour') + max_vals.append(23) + for a, m in zip(attributes, max_vals): + df_tmp.iloc[-1, df_tmp.columns.get_loc(f"{a}_sin")] = np.sin(2 * np.pi * df_tmp[-1, df_tmp.columns.get_loc('timestamp')].dt.__getattribute__(a) / m) + df_tmp.iloc[-1, df_tmp.columns.get_loc(f"{a}_cos")] = np.cos(2 * np.pi * df_tmp[-1, df_tmp.columns.get_loc('timestamp')].dt.__getattribute__(a) / m) + df_tmp.iloc[-1, df_tmp.columns.get_loc('week_sin')] = np.sin(2 * np.pi * df_tmp[-1, df_tmp.columns.get_loc('timestamp')].dt.isocalendar().week / 52) + df_tmp.iloc[-1, df_tmp.columns.get_loc('week_cos')] = np.cos(2 * np.pi * df_tmp[-1, df_tmp.columns.get_loc('timestamp')].dt.isocalendar().week / 52) + + + #make predictions + df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5')] = forecast_model.predict(df_tmp.iloc[-1, df_tmp.columns != 'pm2_5' and df_tmp.columns != 'timestamp' and df_tmp.columns != 'margin_of_error' and df_tmp.columns != 'pm2_5_lower' and df_tmp.columns != 'pm2_5_upper'].values.reshape(1, -1)) + + df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5_lower')] = lower_quantile_model.predict(df_tmp.iloc[-1, df_tmp.columns != 'pm2_5' and df_tmp.columns != 'timestamp' and df_tmp.columns != 'margin_of_error' and df_tmp.columns != 'pm2_5_lower' and df_tmp.columns != 'pm2_5_upper'].values.reshape(1, -1)) + + df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5_upper')] 
= upper_quantile_model.predict(df_tmp.iloc[-1, df_tmp.columns != 'pm2_5' and df_tmp.columns != 'timestamp' and df_tmp.columns != 'margin_of_error' and df_tmp.columns != 'pm2_5_lower' and df_tmp.columns != 'pm2_5_upper'].values.reshape(1, -1)) + + df_tmp.iloc[-1, df_tmp.columns.get_loc('margin_of_error')] = (df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5_upper')] - df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5_lower')]) / 2 + + return df_tmp.iloc[-int(horizon):, :] forecasts = pd.DataFrame() forecast_model = get_trained_model_from_gcs( @@ -438,6 +431,8 @@ def get_new_row( upper_quantile_model = get_trained_model_from_gcs( project_name, bucket_name, f"{frequency}_upper_quantile_model.pkl" ) + + df_tmp = data.copy() for device in df_tmp["device_id"].unique(): test_copy = df_tmp[df_tmp["device_id"] == device] @@ -446,19 +441,17 @@ def get_new_row( if frequency == "hourly" else configuration.DAILY_FORECAST_HORIZON ) - for i in range(int(horizon)): - new_row = get_new_row( + device_forecasts = get_forecasts( test_copy, - device, forecast_model, lower_quantile_model, upper_quantile_model, frequency, + horizon, ) - test_copy = pd.concat( - [test_copy, new_row.to_frame().T], ignore_index=True - ) - forecasts = pd.concat([forecasts, test_copy], ignore_index=True) + + forecasts = pd.concat([forecasts, device_forecasts], ignore_index=True) + forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) forecasts["pm2_5_lower"] = forecasts["pm2_5_lower"].astype(float) @@ -477,6 +470,8 @@ def get_new_row( "site_id", ] ][forecasts["time"] >= current_time_utc] + + decode_categorical_features(result, frequency) return result @staticmethod From df858d5b771da4cc5714bc6b28cacbadf0a0dd33 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Wed, 23 Aug 2023 02:08:08 +0300 Subject: [PATCH 10/43] fix issues in new method for making forecasts --- src/airflow/airqo_etl_utils/ml_utils.py | 238 +++++++++++++++--------- src/airflow/dags/ml_prediction_jobs.py | 11 +- src/airflow/dags/ml_training_jobs.py | 9 +- 3 files changed, 164 insertions(+), 94 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index c13ba9d7c8..fcec50342e 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -59,6 +59,7 @@ def get_mapping_from_gcs(project_name, bucket_name, source_blob_name): mapping_dict = json.load(f) return mapping_dict + def decode_categorical_features(df, frequency): columns = ["device_id", "site_id", "device_category"] for col in columns: @@ -74,6 +75,7 @@ def decode_categorical_features(df, frequency): df[col] = df[col].map(mapping) return df + class ForecastUtils: @staticmethod def preprocess_data(data, frequency): @@ -176,8 +178,6 @@ def get_time_and_cyclic_features(df, freq): df1.drop(columns=attributes + ["week"], inplace=True) return df1 - - df_tmp = data.copy() df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) df_tmp = get_lag_features(df_tmp, target_column, frequency) @@ -186,10 +186,10 @@ def get_time_and_cyclic_features(df, freq): df_tmp = encode_categorical_features(df_tmp, frequency) elif job_type == "predict": df_tmp = decode_categorical_features(df_tmp, frequency) - #convert the categorical columns to int - df_tmp['device_id'] = df_tmp['device_id'].astype(int) - df_tmp['site_id'] = df_tmp['site_id'].astype(int) - df_tmp['device_category'] = df_tmp['device_category'].astype(int) + df_tmp.dropna(subset=["device_id", "site_id", "device_category"], inplace=True) + 
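Before saving, the job keeps only rows at or after the current UTC time; that comparison only works when both the cutoff and the forecast timestamps are timezone-aware. A tiny sketch of the filter on a synthetic frame:

import pandas as pd

forecasts = pd.DataFrame({
    "timestamp": pd.date_range("2023-08-23", periods=4, freq="H", tz="UTC"),
    "pm2_5": [20.0, 22.5, 25.0, 23.0],
})

# Both sides are tz-aware; comparing aware and naive timestamps raises a TypeError.
cutoff = pd.Timestamp.now(tz="UTC")
future_only = forecasts[forecasts["timestamp"] >= cutoff]
print(future_only)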
df_tmp["device_id"] = df_tmp["device_id"].astype(int) + df_tmp["site_id"] = df_tmp["site_id"].astype(int) + df_tmp["device_category"] = df_tmp["device_category"].astype(int) return df_tmp @@ -313,7 +313,10 @@ def objective(trial): alphas = [0.025, 0.975] models = [] - names = [f'{frequency}_lower_quantile_model', f'{frequency}_upper_quantile_model'] + names = [ + f"{frequency}_lower_quantile_model", + f"{frequency}_upper_quantile_model", + ] for alpha in alphas: clf = LGBMRegressor( @@ -337,15 +340,10 @@ def objective(trial): ) models.append(clf) for n, m in zip(names, models): - upload_trained_model_to_gcs( - m, project_id, bucket, f"{n}.pkl" - ) - + upload_trained_model_to_gcs(m, project_id, bucket, f"{n}.pkl") @staticmethod - def generate_forecasts( - data, project_name, bucket_name, frequency - ): + def generate_forecasts(data, project_name, bucket_name, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) data["pm2_5_lower"] = data["pm2_5_upper"] = data["margin_of_error"] = 0 @@ -355,84 +353,155 @@ def get_forecasts( lower_quantile_model, upper_quantile_model, frequency, - horizon + horizon, ): """This method generates forecasts for a given device dataframe basing on horizon provided""" df_tmp = df.copy() for i in range(int(horizon)): - df_tmp = pd.concat([df_tmp, df.iloc[-1]], ignore_index=True) - similar_columns = ['site_id', 'device_id', 'device_category', 'latitude', 'longitude'] - for col in similar_columns: - df_tmp.iloc[-1, df_tmp.columns.get_loc(col)] = df_tmp.iloc[-2, df_tmp.columns.get_loc(col)] - - #daily frequency - if frequency == 'daily': - df_tmp.iloc[-1, df_tmp.columns.get_loc('timestamp')] = df.iloc[-2, df_tmp.columns.get_loc('timestamp')] + pd.Timedelta(days=1) - - #lag features - shifts1 = [1,2,3,7,14] - for s in shifts1: - df_tmp.iloc[-1, df_tmp.columns.get_loc(f"pm2_5_last_{s}_day")] = df_tmp['pm2_5'].shift(s) - - #rolling features - shifts2 = [2,3,7,14] - functions = ['mean', 'std', 'max', 'min'] - for s in shifts2: - for f in functions: - df_tmp.iloc[-1, df_tmp.columns.get_loc(f"pm2_5_{f}_{s}_day")] = df_tmp['pm2_5'].shift(1).rolling(s).agg(f) - - # hourly frequency - elif frequency == 'hourly': - df_tmp.iloc[-1, df_tmp.columns.get_loc('timestamp')] = df.iloc[-2, df_tmp.columns.get_loc('timestamp')] + pd.Timedelta(hours=1) - - #lag features - shifts1 = [1,2,6,12] - for s in shifts1: - df_tmp.iloc[-1, df_tmp.columns.get_loc(f"pm2_5_last_{s}_hour")] = df_tmp['pm2_5'].shift(s) - - #rolling features - shifts2 = [3,6,12,24] - functions = ['mean', 'std', 'median', 'skew'] - for s in shifts2: - for f in functions: - df_tmp.iloc[-1, df_tmp.columns.get_loc(f"pm2_5_{f}_{s}_hour")] = df_tmp['pm2_5'].shift(1).rolling(s).agg(f) - - #time and cyclic features - attributes = ['year', 'month', 'day', 'dayofweek'] - max_vals = [2023, 12, 30, 7] - if frequency == 'hourly': - attributes.append('hour') - max_vals.append(23) - for a, m in zip(attributes, max_vals): - df_tmp.iloc[-1, df_tmp.columns.get_loc(f"{a}_sin")] = np.sin(2 * np.pi * df_tmp[-1, df_tmp.columns.get_loc('timestamp')].dt.__getattribute__(a) / m) - df_tmp.iloc[-1, df_tmp.columns.get_loc(f"{a}_cos")] = np.cos(2 * np.pi * df_tmp[-1, df_tmp.columns.get_loc('timestamp')].dt.__getattribute__(a) / m) - df_tmp.iloc[-1, df_tmp.columns.get_loc('week_sin')] = np.sin(2 * np.pi * df_tmp[-1, df_tmp.columns.get_loc('timestamp')].dt.isocalendar().week / 52) - df_tmp.iloc[-1, df_tmp.columns.get_loc('week_cos')] = np.cos(2 * np.pi * df_tmp[-1, df_tmp.columns.get_loc('timestamp')].dt.isocalendar().week / 52) - - - #make 
predictions - df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5')] = forecast_model.predict(df_tmp.iloc[-1, df_tmp.columns != 'pm2_5' and df_tmp.columns != 'timestamp' and df_tmp.columns != 'margin_of_error' and df_tmp.columns != 'pm2_5_lower' and df_tmp.columns != 'pm2_5_upper'].values.reshape(1, -1)) - - df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5_lower')] = lower_quantile_model.predict(df_tmp.iloc[-1, df_tmp.columns != 'pm2_5' and df_tmp.columns != 'timestamp' and df_tmp.columns != 'margin_of_error' and df_tmp.columns != 'pm2_5_lower' and df_tmp.columns != 'pm2_5_upper'].values.reshape(1, -1)) - - df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5_upper')] = upper_quantile_model.predict(df_tmp.iloc[-1, df_tmp.columns != 'pm2_5' and df_tmp.columns != 'timestamp' and df_tmp.columns != 'margin_of_error' and df_tmp.columns != 'pm2_5_lower' and df_tmp.columns != 'pm2_5_upper'].values.reshape(1, -1)) - - df_tmp.iloc[-1, df_tmp.columns.get_loc('margin_of_error')] = (df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5_upper')] - df_tmp.iloc[-1, df_tmp.columns.get_loc('pm2_5_lower')]) / 2 - - return df_tmp.iloc[-int(horizon):, :] + df_tmp = pd.concat([df_tmp, df_tmp.iloc[-1:]],ignore_index=True) + similar_columns = [ + "site_id", + "device_id", + "device_category", + "latitude", + "longitude", + ] + for col in similar_columns: + df_tmp.iloc[-1, df_tmp.columns.get_loc(col)] = df_tmp.iloc[ + -2, df_tmp.columns.get_loc(col) + ] + + # daily frequency + if frequency == "daily": + df_tmp.iloc[-1, df_tmp.columns.get_loc("timestamp")] = df.iloc[ + -2, df_tmp.columns.get_loc("timestamp") + ] + pd.Timedelta(days=1) + + # lag features + shifts1 = [1, 2, 3, 7, 14] + for s in shifts1: + df_tmp.iloc[ + -1, df_tmp.columns.get_loc(f"pm2_5_last_{s}_day") + ] = df_tmp["pm2_5"].shift(s) + + # rolling features + shifts2 = [2, 3, 7, 14] + functions = ["mean", "std", "max", "min"] + for s in shifts2: + for f in functions: + df_tmp.iloc[ + -1, df_tmp.columns.get_loc(f"pm2_5_{f}_{s}_day") + ] = (df_tmp["pm2_5"].shift(1).rolling(s).agg(f)) + + # hourly frequency + elif frequency == "hourly": + df_tmp.iloc[-1, df_tmp.columns.get_loc("timestamp")] = df.iloc[ + -2, df_tmp.columns.get_loc("timestamp") + ] + pd.Timedelta(hours=1) + + # lag features + shifts1 = [1, 2, 6, 12] + for s in shifts1: + df_tmp.iloc[ + -1, df_tmp.columns.get_loc(f"pm2_5_last_{s}_hour") + ] = df_tmp["pm2_5"].shift(s) + + # rolling features + shifts2 = [3, 6, 12, 24] + functions = ["mean", "std", "median", "skew"] + for s in shifts2: + for f in functions: + df_tmp.iloc[ + -1, df_tmp.columns.get_loc(f"pm2_5_{f}_{s}_hour") + ] = (df_tmp["pm2_5"].shift(1).rolling(s).agg(f)) + + # time and cyclic features + attributes = ["year", "month", "day", "dayofweek"] + max_vals = [2023, 12, 30, 7] + if frequency == "hourly": + attributes.append("hour") + max_vals.append(23) + for a, m in zip(attributes, max_vals): + df_tmp.iloc[-1, df_tmp.columns.get_loc(f"{a}_sin")] = np.sin( + 2 + * np.pi + * df_tmp[ + -1, df_tmp.columns.get_loc("timestamp") + ].dt.__getattribute__(a) + / m + ) + df_tmp.iloc[-1, df_tmp.columns.get_loc(f"{a}_cos")] = np.cos( + 2 + * np.pi + * df_tmp[ + -1, df_tmp.columns.get_loc("timestamp") + ].dt.__getattribute__(a) + / m + ) + df_tmp.iloc[-1, df_tmp.columns.get_loc("week_sin")] = np.sin( + 2 + * np.pi + * df_tmp[-1, df_tmp.columns.get_loc("timestamp")] + .dt.isocalendar() + .week + / 52 + ) + df_tmp.iloc[-1, df_tmp.columns.get_loc("week_cos")] = np.cos( + 2 + * np.pi + * df_tmp[-1, df_tmp.columns.get_loc("timestamp")] + .dt.isocalendar() + .week + / 52 + ) + 
+ # make predictions + excluded_columns = ["pm2_5", "timestamp", "margin_of_error", "pm2_5_lower", "pm2_5_upper"] + df_tmp.iloc[ + -1, df_tmp.columns.get_loc("pm2_5") + ] = forecast_model.predict( + df_tmp.iloc[ + -1, + df_tmp.columns not in excluded_columns, + ].values.reshape(1, -1) + ) + + df_tmp.iloc[ + -1, df_tmp.columns.get_loc("pm2_5_lower") + ] = lower_quantile_model.predict( + df_tmp.iloc[ + -1, + df_tmp.columns not in excluded_columns, + ].values.reshape(1, -1) + ) + + df_tmp.iloc[ + -1, df_tmp.columns.get_loc("pm2_5_upper") + ] = upper_quantile_model.predict( + df_tmp.iloc[ + -1, + df_tmp.columns not in excluded_columns, + ].values.reshape(1, -1) + ) + + df_tmp.iloc[-1, df_tmp.columns.get_loc("margin_of_error")] = ( + df_tmp.iloc[-1, df_tmp.columns.get_loc("pm2_5_upper")] + - df_tmp.iloc[-1, df_tmp.columns.get_loc("pm2_5_lower")] + ) / 2 + + return df_tmp.iloc[-int(horizon) :, :] forecasts = pd.DataFrame() forecast_model = get_trained_model_from_gcs( project_name, bucket_name, f"{frequency}_forecast_model.pkl" ) lower_quantile_model = get_trained_model_from_gcs( - project_name, bucket_name, f"{frequency}_lower_quantile_model.pkl" + project_name, bucket_name, f"{frequency}_lower_quantile_model.pkl" ) upper_quantile_model = get_trained_model_from_gcs( project_name, bucket_name, f"{frequency}_upper_quantile_model.pkl" ) - df_tmp = data.copy() for device in df_tmp["device_id"].unique(): test_copy = df_tmp[df_tmp["device_id"] == device] @@ -442,17 +511,16 @@ def get_forecasts( else configuration.DAILY_FORECAST_HORIZON ) device_forecasts = get_forecasts( - test_copy, - forecast_model, - lower_quantile_model, - upper_quantile_model, - frequency, - horizon, - ) + test_copy, + forecast_model, + lower_quantile_model, + upper_quantile_model, + frequency, + horizon, + ) forecasts = pd.concat([forecasts, device_forecasts], ignore_index=True) - forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) forecasts["pm2_5_lower"] = forecasts["pm2_5_lower"].astype(float) forecasts["pm2_5_upper"] = forecasts["pm2_5_upper"].astype(float) diff --git a/src/airflow/dags/ml_prediction_jobs.py b/src/airflow/dags/ml_prediction_jobs.py index f90233afa4..04972fccd6 100644 --- a/src/airflow/dags/ml_prediction_jobs.py +++ b/src/airflow/dags/ml_prediction_jobs.py @@ -35,11 +35,13 @@ def preprocess_historical_data_hourly_forecast(data): @task def feature_eng_hourly_historical_data(data): - return ForecastUtils.feature_eng_data(data, 'pm2_5', 'hourly', 'predict') + return ForecastUtils.feature_eng_data(data, "pm2_5", "hourly", "predict") @task() def make_hourly_forecasts(data): - return ForecastUtils.generate_forecasts(data=data, project_name=project_id, bucket_name= bucket,frequency='hourly') + return ForecastUtils.generate_forecasts( + data=data, project_name=project_id, bucket_name=bucket, frequency="hourly" + ) @task() def save_hourly_forecasts_to_bigquery(data): @@ -69,12 +71,11 @@ def preprocess_historical_data_daily_forecast(data): @task() def feature_engineer_daily_historical_data(data): - return ForecastUtils.feature_eng_data(data, 'pm2_5', 'daily', 'predict') + return ForecastUtils.feature_eng_data(data, "pm2_5", "daily", "predict") @task() def make_daily_forecasts(data): - return ForecastUtils.generate_forecasts(data, project_id, bucket, 'daily') - + return ForecastUtils.generate_forecasts(data, project_id, bucket, "daily") @task() def save_daily_forecasts_to_bigquery(data): diff --git a/src/airflow/dags/ml_training_jobs.py b/src/airflow/dags/ml_training_jobs.py index 32287ee59c..836441c52a 100644 --- 
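Selecting "every column except these" from a single row is easiest to express with an Index mask; chaining boolean comparisons on a columns Index with the and keyword, or testing the Index with not in against a list, typically fails with an ambiguous-truth-value error rather than producing the intended mask. A small sketch of the usual idiom, reusing the column names assumed above:

import pandas as pd

row = pd.DataFrame(
    [[35.2, 30.1, 40.3, 0.5]],
    columns=["pm2_5", "pm2_5_lower", "pm2_5_upper", "hour_sin"],
)
excluded_columns = ["pm2_5", "pm2_5_lower", "pm2_5_upper"]

# Boolean mask over the columns: True where the column is not excluded,
# preserving the original feature order for the model.
feature_mask = ~row.columns.isin(excluded_columns)
features = row.iloc[-1, feature_mask].values.reshape(1, -1)
print(features)  # [[0.5]]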
a/src/airflow/dags/ml_training_jobs.py +++ b/src/airflow/dags/ml_training_jobs.py @@ -15,7 +15,6 @@ tags=["airqo", "hourly-forecast", "daily-forecast", "training-job"], ) def train_forecasting_models(): - # Hourly forecast tasks @task() def fetch_training_data_for_hourly_forecast_model(): @@ -28,6 +27,7 @@ def fetch_training_data_for_hourly_forecast_model(): ) start_date = date_to_str(start_date, str_format="%Y-%m-%d") return BigQueryApi().fetch_data(start_date) + @task() def preprocess_training_data_for_hourly_forecast_model(data): return ForecastUtils.preprocess_data(data, "hourly") @@ -38,10 +38,11 @@ def feature_engineer_training_data_for_hourly_forecast_model(data): @task() def train_and_save_hourly_forecast_model(train_data): - return ForecastUtils.train_and_save_forecast_models(train_data, frequency='hourly') - + return ForecastUtils.train_and_save_forecast_models( + train_data, frequency="hourly" + ) -# Daily forecast tasks + # Daily forecast tasks @task() def fetch_training_data_for_daily_forecast_model(): from dateutil.relativedelta import relativedelta From 169222f807e4fe627278eed81e76e08eae702760 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Wed, 23 Aug 2023 10:58:37 +0300 Subject: [PATCH 11/43] Update ml_utils.py --- src/airflow/airqo_etl_utils/ml_utils.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index fcec50342e..475506c650 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -380,19 +380,17 @@ def get_forecasts( # lag features shifts1 = [1, 2, 3, 7, 14] for s in shifts1: - df_tmp.iloc[ - -1, df_tmp.columns.get_loc(f"pm2_5_last_{s}_day") - ] = df_tmp["pm2_5"].shift(s) + df_tmp[f"pm2_5_last_{s}_day"] = df_tmp.shift(s, axis=0)["pm2_5"] + # rolling features shifts2 = [2, 3, 7, 14] functions = ["mean", "std", "max", "min"] for s in shifts2: for f in functions: - df_tmp.iloc[ - -1, df_tmp.columns.get_loc(f"pm2_5_{f}_{s}_day") - ] = (df_tmp["pm2_5"].shift(1).rolling(s).agg(f)) + df_tmp[f"pm2_5_{f}_{s}_day"] = (df_tmp.shift(1, axis=0).rolling(s).agg(f))["pm2_5"] + print('done') # hourly frequency elif frequency == "hourly": df_tmp.iloc[-1, df_tmp.columns.get_loc("timestamp")] = df.iloc[ @@ -402,20 +400,17 @@ def get_forecasts( # lag features shifts1 = [1, 2, 6, 12] for s in shifts1: - df_tmp.iloc[ - -1, df_tmp.columns.get_loc(f"pm2_5_last_{s}_hour") - ] = df_tmp["pm2_5"].shift(s) + df_tmp[f"pm2_5_last_{s}_hour"] = df_tmp.shift(s, axis=0)["pm2_5"] + # rolling features shifts2 = [3, 6, 12, 24] functions = ["mean", "std", "median", "skew"] for s in shifts2: for f in functions: - df_tmp.iloc[ - -1, df_tmp.columns.get_loc(f"pm2_5_{f}_{s}_hour") - ] = (df_tmp["pm2_5"].shift(1).rolling(s).agg(f)) + df_tmp[f"pm2_5_{f}_{s}_hour"] = (df_tmp.shift(1, axis=0).rolling(s).agg(f))["pm2_5"] + - # time and cyclic features attributes = ["year", "month", "day", "dayofweek"] max_vals = [2023, 12, 30, 7] if frequency == "hourly": From 167261c4698268f05b1d38e3b12654494bcf4e43 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Thu, 24 Aug 2023 01:07:34 +0300 Subject: [PATCH 12/43] fix forecast generation method --- src/airflow/airqo_etl_utils/ml_utils.py | 121 +++++++----------------- 1 file changed, 34 insertions(+), 87 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 
475506c650..e5a6c99445 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -1,6 +1,6 @@ import json import random -from datetime import datetime +from datetime import datetime, timedelta import gcsfs import joblib @@ -19,6 +19,7 @@ bucket = configuration.FORECAST_MODELS_BUCKET environment = configuration.ENVIRONMENT +pd.options.mode.chained_assignment = None def get_trained_model_from_gcs(project_name, bucket_name, source_blob_name): fs = gcsfs.GCSFileSystem(project=project_name) @@ -345,10 +346,11 @@ def objective(trial): @staticmethod def generate_forecasts(data, project_name, bucket_name, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) - data["pm2_5_lower"] = data["pm2_5_upper"] = data["margin_of_error"] = 0 + data.columns = data.columns.str.strip() + data["pm2_5_lower"] = data["pm2_5_upper"] = data["margin_of_error"] = data["adjusted_forecast"] = 0 def get_forecasts( - df, + df_tmp, forecast_model, lower_quantile_model, upper_quantile_model, @@ -356,44 +358,31 @@ def get_forecasts( horizon, ): """This method generates forecasts for a given device dataframe basing on horizon provided""" - df_tmp = df.copy() for i in range(int(horizon)): - df_tmp = pd.concat([df_tmp, df_tmp.iloc[-1:]],ignore_index=True) - similar_columns = [ - "site_id", - "device_id", - "device_category", - "latitude", - "longitude", - ] - for col in similar_columns: - df_tmp.iloc[-1, df_tmp.columns.get_loc(col)] = df_tmp.iloc[ - -2, df_tmp.columns.get_loc(col) - ] - + df_tmp = pd.concat([df_tmp, df_tmp.iloc[-1:]], ignore_index=True) + df_tmp_no_ts = df_tmp.drop("timestamp", axis=1, inplace=False) # daily frequency if frequency == "daily": - df_tmp.iloc[-1, df_tmp.columns.get_loc("timestamp")] = df.iloc[ - -2, df_tmp.columns.get_loc("timestamp") - ] + pd.Timedelta(days=1) - - # lag features + df_tmp.tail(1)['timestamp'] += timedelta(days=1) shifts1 = [1, 2, 3, 7, 14] for s in shifts1: df_tmp[f"pm2_5_last_{s}_day"] = df_tmp.shift(s, axis=0)["pm2_5"] - - # rolling features shifts2 = [2, 3, 7, 14] functions = ["mean", "std", "max", "min"] + #review this for s in shifts2: for f in functions: - df_tmp[f"pm2_5_{f}_{s}_day"] = (df_tmp.shift(1, axis=0).rolling(s).agg(f))["pm2_5"] + df_tmp[f"pm2_5_{f}_{s}_day"] = (df_tmp_no_ts.shift(1, axis=0).rolling(s).agg(f))[ + "pm2_5" + ] print('done') - # hourly frequency + + elif frequency == "hourly": - df_tmp.iloc[-1, df_tmp.columns.get_loc("timestamp")] = df.iloc[ + df_tmp.iloc[-1, df_tmp.columns.get_loc("timestamp")] = df_tmp.iloc[ + -2, df_tmp.columns.get_loc("timestamp") ] + pd.Timedelta(hours=1) @@ -417,72 +406,28 @@ def get_forecasts( attributes.append("hour") max_vals.append(23) for a, m in zip(attributes, max_vals): - df_tmp.iloc[-1, df_tmp.columns.get_loc(f"{a}_sin")] = np.sin( - 2 - * np.pi - * df_tmp[ - -1, df_tmp.columns.get_loc("timestamp") - ].dt.__getattribute__(a) - / m - ) - df_tmp.iloc[-1, df_tmp.columns.get_loc(f"{a}_cos")] = np.cos( - 2 - * np.pi - * df_tmp[ - -1, df_tmp.columns.get_loc("timestamp") - ].dt.__getattribute__(a) - / m - ) - df_tmp.iloc[-1, df_tmp.columns.get_loc("week_sin")] = np.sin( - 2 - * np.pi - * df_tmp[-1, df_tmp.columns.get_loc("timestamp")] - .dt.isocalendar() - .week - / 52 - ) - df_tmp.iloc[-1, df_tmp.columns.get_loc("week_cos")] = np.cos( - 2 - * np.pi - * df_tmp[-1, df_tmp.columns.get_loc("timestamp")] - .dt.isocalendar() - .week - / 52 - ) + df_tmp.tail(1)[f"{a}_sin"] = np.sin(2 * np.pi * df_tmp.tail(1)["timestamp"].dt.__getattribute__(a) / m) + df_tmp.tail(1)[f"{a}_cos"] 
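# Illustrative sketch, not part of the patch: assignment through tail(1) goes via an
# intermediate object and is not guaranteed to reach the original frame (pandas flags
# it with SettingWithCopyWarning), while .loc on the last index label writes in place.
import pandas as pd

df = pd.DataFrame({"pm2_5": [10.0, 12.0, 14.0]})
df.tail(1)["pm2_5"] = 99.0            # chained assignment, may be silently lost
df.loc[df.index[-1], "pm2_5"] = 99.0  # updates df itself
print(df)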
= np.cos(2 * np.pi * df_tmp.tail(1)["timestamp"].dt.__getattribute__(a) / m) + df_tmp.tail(1)["week_sin"] = np.sin(2 * np.pi * df_tmp.tail(1)["timestamp"].dt.isocalendar().week / 52) + df_tmp.tail(1)["week_cos"] = np.cos(2 * np.pi * df_tmp.tail(1)["timestamp"].dt.isocalendar().week / 52) # make predictions - excluded_columns = ["pm2_5", "timestamp", "margin_of_error", "pm2_5_lower", "pm2_5_upper"] - df_tmp.iloc[ - -1, df_tmp.columns.get_loc("pm2_5") - ] = forecast_model.predict( - df_tmp.iloc[ - -1, - df_tmp.columns not in excluded_columns, - ].values.reshape(1, -1) + excluded_columns = ["pm2_5", "timestamp", "margin_of_error", "pm2_5_lower", "pm2_5_upper", "adjusted_forecast"] + print(df_tmp.tail(1)) + # df_tmp.tail(1)['pm2_5'] = forecast_model.predict(df_tmp.tail(1).drop(excluded_columns).values.reshape(1, -1)) + df_tmp.loc[df_tmp.index[-1], "pm2_5"] = forecast_model.predict( + df_tmp.drop(excluded_columns, axis=1).tail(1).values.reshape(1, -1) ) - - df_tmp.iloc[ - -1, df_tmp.columns.get_loc("pm2_5_lower") - ] = lower_quantile_model.predict( - df_tmp.iloc[ - -1, - df_tmp.columns not in excluded_columns, - ].values.reshape(1, -1) + df_tmp.loc[df_tmp.index[-1], "pm2_5_lower"] = lower_quantile_model.predict( + df_tmp.drop(excluded_columns, axis=1).tail(1).values.reshape(1, -1) ) - - df_tmp.iloc[ - -1, df_tmp.columns.get_loc("pm2_5_upper") - ] = upper_quantile_model.predict( - df_tmp.iloc[ - -1, - df_tmp.columns not in excluded_columns, - ].values.reshape(1, -1) + df_tmp.loc[df_tmp.index[-1], "pm2_5_upper"] = upper_quantile_model.predict( + df_tmp.drop(excluded_columns, axis=1).tail(1).values.reshape(1, -1) ) + df_tmp.loc[df_tmp.index[-1], "margin_of_error"] = (df_tmp.loc[df_tmp.index[-1], "pm2_5_upper"] - df_tmp.loc[df_tmp.index[-1], "pm2_5_lower"])/2 + df_tmp.loc[df_tmp.index[-1], "adjusted_forecast"] = df_tmp.loc[df_tmp.index[-1], "pm2_5"] + df_tmp.loc[df_tmp.index[-1], "margin_of_error"] + - df_tmp.iloc[-1, df_tmp.columns.get_loc("margin_of_error")] = ( - df_tmp.iloc[-1, df_tmp.columns.get_loc("pm2_5_upper")] - - df_tmp.iloc[-1, df_tmp.columns.get_loc("pm2_5_lower")] - ) / 2 return df_tmp.iloc[-int(horizon) :, :] @@ -515,6 +460,8 @@ def get_forecasts( ) forecasts = pd.concat([forecasts, device_forecasts], ignore_index=True) + print(device) + forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) forecasts["pm2_5_lower"] = forecasts["pm2_5_lower"].astype(float) From 31e9ec142cb0a03dc38e5ff4a7ca2fd83da84c5f Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Thu, 24 Aug 2023 02:25:10 +0300 Subject: [PATCH 13/43] Update ml_utils.py --- src/airflow/airqo_etl_utils/ml_utils.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index e5a6c99445..b4e6b59dab 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -370,7 +370,6 @@ def get_forecasts( # rolling features shifts2 = [2, 3, 7, 14] functions = ["mean", "std", "max", "min"] - #review this for s in shifts2: for f in functions: df_tmp[f"pm2_5_{f}_{s}_day"] = (df_tmp_no_ts.shift(1, axis=0).rolling(s).agg(f))[ @@ -467,22 +466,8 @@ def get_forecasts( forecasts["pm2_5_lower"] = forecasts["pm2_5_lower"].astype(float) forecasts["pm2_5_upper"] = forecasts["pm2_5_upper"].astype(float) forecasts["margin_of_error"] = forecasts["margin_of_error"].astype(float) - current_time_utc = pd.Timestamp(datetime.utcnow(), tz="UTC") - 
forecasts.rename(columns={"timestamp": "time"}, inplace=True) - result = forecasts[ - [ - "timestamp", - "pm2_5", - "pm2_5_lower", - "pm2_5_upper", - "margin_of_error", - "device_id", - "site_id", - ] - ][forecasts["time"] >= current_time_utc] - - decode_categorical_features(result, frequency) - return result + decode_categorical_features(forecasts, frequency) + return forecasts @staticmethod def save_forecasts_to_mongo(data, frequency): From 4dfa9acf8faa0b883690c0d993e672b966ec4366 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Fri, 25 Aug 2023 23:18:34 +0300 Subject: [PATCH 14/43] refactor endpoints --- src/predict/api/dump.rdb | Bin 89 -> 0 bytes src/predict/api/helpers.py | 14 +++++++++----- src/predict/api/prediction.py | 13 +++++++++++++ 3 files changed, 22 insertions(+), 5 deletions(-) delete mode 100644 src/predict/api/dump.rdb diff --git a/src/predict/api/dump.rdb b/src/predict/api/dump.rdb deleted file mode 100644 index 3890d66e38ccf5adc5d018cc4ea4f93fb47d027f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 89 zcmWG?b@2=~FfcUu#aWb^l3A=2Jaq0nv{|u`R$>lYFJ`4clZ6wnG diff --git a/src/predict/api/helpers.py b/src/predict/api/helpers.py index b9edb0682b..baa1777d93 100644 --- a/src/predict/api/helpers.py +++ b/src/predict/api/helpers.py @@ -85,8 +85,8 @@ def geo_coordinates_cache_key(): def get_health_tips() -> list[dict]: try: response = requests.get( - f"{Config.AIRQO_BASE_URL}/api/v2/devices/tips?token={Config.AIRQO_API_AUTH_TOKEN}", - timeout=3, + f"{Config.AIRQO_BASE_URL}api/v2/devices/tips?token={Config.AIRQO_API_AUTH_TOKEN}", + timeout=10, ) result = response.json() return result["tips"] @@ -186,6 +186,7 @@ def get_predictions_by_geo_coordinates_v2(latitude: float, longitude: float) -> @cache.memoize(timeout=Config.CACHE_TIMEOUT) def get_forecasts( db_name, + device_id=None, site_id=None, site_name=None, parish=None, @@ -196,6 +197,7 @@ def get_forecasts( ): query = {} params = { + "device_id": device_id, "site_id": site_id, "site_name": site_name, "parish": parish, @@ -213,14 +215,16 @@ def get_forecasts( results = [] if site_forecasts: - for time, pm2_5, health_tips in zip( - site_forecasts[0]["time"], + for time, pm2_5, margin_of_error, adjusted_forecast in zip( + site_forecasts[0]["timestamp"], site_forecasts[0]["pm2_5"], + site_forecasts[0]["margin_of_error"], + site_forecasts[0]["adjusted_forecast"], ): result = { key: value for key, value in zip( - ["time", "pm2_5"], [time, pm2_5] + ["time", "pm2_5", "margin_of_error", "adjusted_forecast"], [time, pm2_5, margin_of_error, adjusted_forecast] ) } results.append(result) diff --git a/src/predict/api/prediction.py b/src/predict/api/prediction.py index 97335f31a6..068ae4ce48 100644 --- a/src/predict/api/prediction.py +++ b/src/predict/api/prediction.py @@ -124,6 +124,7 @@ def get_next_1_week_forecasts(): params = { name: request.args.get(name, default=None, type=str) for name in [ + "device_id", "site_id", "site_name", "parish", @@ -145,6 +146,18 @@ def get_next_1_week_forecasts(): ) result = get_forecasts(**params, db_name="daily_forecasts") if result: + health_tips = get_health_tips() + for forecast in result["forecasts"]: + pm2_5 = forecast["pm2_5"] + forecast["health_tips"] = list( + filter( + lambda x: x["aqi_category"]["max"] + >= pm2_5 + >= x["aqi_category"]["min"], + health_tips, + ) + ) + response = result else: response = { From b31a0fb61edb1e40805ffd01865b378335663d76 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble 
<60974514+Mnoble-19@users.noreply.github.com> Date: Sat, 26 Aug 2023 01:28:51 +0300 Subject: [PATCH 15/43] code cleanup --- src/airflow/airqo_etl_utils/ml_utils.py | 437 +++++++++++++++--------- src/airflow/dags/ml_training_jobs.py | 13 +- src/airflow/requirements.txt | 2 +- src/predict/api/helpers.py | 9 +- src/predict/api/prediction.py | 8 +- 5 files changed, 290 insertions(+), 179 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index b4e6b59dab..591f72e133 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -21,70 +21,115 @@ pd.options.mode.chained_assignment = None -def get_trained_model_from_gcs(project_name, bucket_name, source_blob_name): - fs = gcsfs.GCSFileSystem(project=project_name) - fs.ls(bucket_name) - with fs.open(bucket_name + "/" + source_blob_name, "rb") as handle: - job = joblib.load(handle) - return job - - -def upload_trained_model_to_gcs( - trained_model, project_name, bucket_name, source_blob_name -): - fs = gcsfs.GCSFileSystem(project=project_name) - try: - fs.rename( - f"{bucket_name}/{source_blob_name}", - f"{bucket_name}/{datetime.now()}-{source_blob_name}", - ) - print("Bucket: previous model is backed up") - except: - print("Bucket: No file to updated") - # store new model - with fs.open(bucket_name + "/" + source_blob_name, "wb") as handle: - job = joblib.dump(trained_model, handle) +class GCSUtils: + # TODO: In future, save and retrieve models from mlflow instead of GCS + @staticmethod + def get_trained_model_from_gcs(project_name, bucket_name, source_blob_name): + fs = gcsfs.GCSFileSystem(project=project_name) + fs.ls(bucket_name) + with fs.open(bucket_name + "/" + source_blob_name, "rb") as handle: + job = joblib.load(handle) + return job + @staticmethod + def upload_trained_model_to_gcs( + trained_model, project_name, bucket_name, source_blob_name + ): + fs = gcsfs.GCSFileSystem(project=project_name) + try: + fs.rename( + f"{bucket_name}/{source_blob_name}", + f"{bucket_name}/{datetime.now()}-{source_blob_name}", + ) + print("Bucket: previous model is backed up") + except: + print("Bucket: No file to updated") -def upload_mapping_to_gcs(mapping_dict, project_name, bucket_name, source_blob_name): - fs = gcsfs.GCSFileSystem(project=project_name) - mapping_dict = json.dumps(mapping_dict) - with fs.open(bucket_name + "/" + source_blob_name, "w") as f: - f.write(mapping_dict) + with fs.open(bucket_name + "/" + source_blob_name, "wb") as handle: + job = joblib.dump(trained_model, handle) + @staticmethod + def upload_mapping_to_gcs( + mapping_dict, project_name, bucket_name, source_blob_name + ): + fs = gcsfs.GCSFileSystem(project=project_name) + mapping_dict = json.dumps(mapping_dict) + with fs.open(bucket_name + "/" + source_blob_name, "w") as f: + f.write(mapping_dict) -def get_mapping_from_gcs(project_name, bucket_name, source_blob_name): - fs = gcsfs.GCSFileSystem(project=project_name) - with fs.open(bucket_name + "/" + source_blob_name, "r") as f: - mapping_dict = json.load(f) - return mapping_dict + @staticmethod + def get_mapping_from_gcs(project_name, bucket_name, source_blob_name): + fs = gcsfs.GCSFileSystem(project=project_name) + with fs.open(bucket_name + "/" + source_blob_name, "r") as f: + mapping_dict = json.load(f) + return mapping_dict -def decode_categorical_features(df, frequency): - columns = ["device_id", "site_id", "device_category"] - for col in columns: - if frequency == "hourly": - mapping = get_mapping_from_gcs( - project_id, 
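# Illustrative sketch, not part of the patch: the encode/decode round trip for
# categorical ids, assuming a {original_value: integer_code} mapping like the JSON
# dictionaries stored alongside the models; values below are made up.
import pandas as pd

df = pd.DataFrame({"device_id": ["aq_01", "aq_02", "aq_01"]})
mapping = {"aq_01": 101, "aq_02": 202}
df["device_id"] = df["device_id"].map(mapping)                # encode for the model
inverse = {code: name for name, code in mapping.items()}
df["device_id"] = df["device_id"].map(inverse)                # decode before saving
print(df)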
bucket, f"hourly_{col}_mapping.json" - ) - elif frequency == "daily": - mapping = get_mapping_from_gcs( - project_id, bucket, f"daily_{col}_mapping.json" - ) +class DecodingUtils: + @staticmethod + def decode_categorical_features_pred(df, frequency): + columns = ["device_id", "site_id", "device_category"] + mapping = {} + for col in columns: + if frequency == "hourly": + mapping = GCSUtils.get_mapping_from_gcs( + project_id, bucket, f"hourly_{col}_mapping.json" + ) + elif frequency == "daily": + mapping = GCSUtils.get_mapping_from_gcs( + project_id, bucket, f"daily_{col}_mapping.json" + ) + df[col] = df[col].map(mapping) + return df - df[col] = df[col].map(mapping) - return df + @staticmethod + def decode_categorical_features_before_save(df, frequency): + columns = ["device_id", "site_id", "device_category"] + mapping = {} + for col in columns: + if frequency == "hourly": + mapping = GCSUtils.get_mapping_from_gcs( + project_id, bucket, f"hourly_{col}_mapping.json" + ) + elif frequency == "daily": + mapping = GCSUtils.get_mapping_from_gcs( + project_id, bucket, f"daily_{col}_mapping.json" + ) + df[col] = df[col].map({v: k for k, v in mapping.items()}) + return df + + def encode_categorical_training_features(df, freq): + df1 = df.copy() + columns = ["device_id", "site_id", "device_category"] + mappings = [] + for col in columns: + mapping = {} + for val in df1[col].unique(): + num = random.randint(0, 1000) + while num in mapping.values(): + num = random.randint(0, 1000) + mapping[val] = num + df1[col] = df1[col].map(mapping) + mappings.append(mapping) + for i, col in enumerate(columns): + GCSUtils.upload_mapping_to_gcs( + mappings[i], + project_id, + bucket, + f"{freq}_{col}_mapping.json", + ) + return df1 class ForecastUtils: @staticmethod - def preprocess_data(data, frequency): + def preprocess_data(data, data_frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])[ "pm2_5" ].transform(lambda x: x.interpolate(method="linear", limit_direction="both")) - if frequency == "daily": + if data_frequency == "daily": data = ( data.groupby(["device_id", "site_id", "device_category"]) .resample("D", on="timestamp") @@ -100,9 +145,9 @@ def preprocess_data(data, frequency): return data @staticmethod - def feature_eng_data(data, target_column, frequency, job_type): + def feature_eng_data(data, target_column, data_frequency, job_type): def get_lag_features(df, target_col, freq): - df1 = df.copy() + df1 = df.copy() # use copy to prevent terminal warning if freq == "daily": shifts = [1, 2, 3, 7, 14] for s in shifts: @@ -142,25 +187,6 @@ def get_lag_features(df, target_col, freq): return df1 - def encode_categorical_features(df, frequency): - df1 = df.copy() - columns = ["device_id", "site_id", "device_category"] - mappings = [] - for col in columns: - mapping = {} - for val in df1[col].unique(): - num = random.randint(0, 10000) - while num in mapping.values(): - num = random.randint(0, 10000) - mapping[val] = num - df1[col] = df1[col].map(mapping) - mappings.append(mapping) - for i, col in enumerate(columns): - upload_mapping_to_gcs( - mappings[i], project_id, bucket, f"{frequency}_{col}_mapping.json" - ) - return df1 - def get_time_and_cyclic_features(df, freq): df1 = df.copy() attributes = ["year", "month", "day", "dayofweek"] @@ -181,13 +207,20 @@ def get_time_and_cyclic_features(df, freq): df_tmp = data.copy() df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) - df_tmp = get_lag_features(df_tmp, target_column, 
frequency) - df_tmp = get_time_and_cyclic_features(df_tmp, frequency) + df_tmp = get_lag_features(df_tmp, target_column, data_frequency) + df_tmp = get_time_and_cyclic_features(df_tmp, data_frequency) if job_type == "train": - df_tmp = encode_categorical_features(df_tmp, frequency) + df_tmp = DecodingUtils.encode_categorical_training_features( + df_tmp, data_frequency + ) elif job_type == "predict": - df_tmp = decode_categorical_features(df_tmp, frequency) - df_tmp.dropna(subset=["device_id", "site_id", "device_category"], inplace=True) + df_tmp = DecodingUtils.decode_categorical_features_pred( + df_tmp, data_frequency + ) + df_tmp.dropna( + subset=["device_id", "site_id", "device_category"], inplace=True + ) # only 1 row, not sure why + df_tmp["device_id"] = df_tmp["device_id"].astype(int) df_tmp["site_id"] = df_tmp["site_id"].astype(int) df_tmp["device_category"] = df_tmp["device_category"].astype(int) @@ -195,27 +228,27 @@ def get_time_and_cyclic_features(df, freq): return df_tmp @staticmethod - def train_and_save_forecast_models(train, frequency): + def train_and_save_forecast_models(training_data, frequency): """ Perform the actual training for hourly data """ - train["timestamp"] = pd.to_datetime(train["timestamp"]) - features = [c for c in train.columns if c not in ["timestamp", "pm2_5"]] + training_data["timestamp"] = pd.to_datetime(training_data["timestamp"]) + features = [c for c in training_data.columns if c not in ["timestamp", "pm2_5"]] print(features) target_col = "pm2_5" train_data = validation_data = test_data = pd.DataFrame() - for device in train["device_id"].unique(): - device_df = train[train["device_id"] == device] + for device in training_data["device_id"].unique(): + device_df = training_data[training_data["device_id"] == device] months = device_df["timestamp"].dt.month.unique() train_months = val_months = test_months = [] if frequency == "hourly": - train_months = months[:4] - val_months = months[4:5] - test_months = months[5:] + train_months = months[:2] + val_months = months[2:3] + test_months = months[3:] elif frequency == "daily": - train_months = months[:8] - val_months = months[8:9] - test_months = months[9:] + train_months = months[:6] + val_months = months[6:7] + test_months = months[7:] train_df = device_df[device_df["timestamp"].dt.month.isin(train_months)] val_df = device_df[device_df["timestamp"].dt.month.isin(val_months)] @@ -308,52 +341,103 @@ def objective(trial): callbacks=[early_stopping(stopping_rounds=150)], ) - upload_trained_model_to_gcs( + GCSUtils.upload_trained_model_to_gcs( clf, project_id, bucket, f"{frequency}_forecast_model.pkl" ) - alphas = [0.025, 0.975] - models = [] - names = [ - f"{frequency}_lower_quantile_model", - f"{frequency}_upper_quantile_model", - ] - - for alpha in alphas: - clf = LGBMRegressor( - n_estimators=best_params["n_estimators"], - learning_rate=best_params["learning_rate"], - colsample_bytree=best_params["colsample_bytree"], - reg_alpha=best_params["reg_alpha"], - reg_lambda=best_params["reg_lambda"], - max_depth=best_params["max_depth"], - random_state=42, - verbosity=2, - objective="quantile", - alpha=alpha, - metric="quantile", + def create_error_df(data, target, preds): + error_df = pd.DataFrame( + { + "actual_values": target, + "predicted_values": preds, + } ) - clf.fit( - train_data[features], - train_target, - eval_set=[(test_data[features], test_target)], - categorical_feature=["device_id", "site_id", "device_category"], + error_df["errors"] = ( + error_df["predicted_values"] - error_df["actual_values"] ) - 
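# Illustrative sketch, not part of the patch: a lower/upper LightGBM quantile pair
# giving an approximate 95% interval and a margin of error; the data and parameters
# here are synthetic.
import numpy as np
from lightgbm import LGBMRegressor

rng = np.random.default_rng(42)
X = rng.normal(size=(500, 3))
y = 5 * X[:, 0] + rng.normal(scale=2, size=500)

lower = LGBMRegressor(objective="quantile", alpha=0.025, n_estimators=50).fit(X, y)
upper = LGBMRegressor(objective="quantile", alpha=0.975, n_estimators=50).fit(X, y)
margin_of_error = (upper.predict(X[:5]) - lower.predict(X[:5])) / 2
print(margin_of_error)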
models.append(clf) - for n, m in zip(names, models): - upload_trained_model_to_gcs(m, project_id, bucket, f"{n}.pkl") + error_df = pd.concat([error_df, data], axis=1) + error_df.drop(["actual_values", "pm2_5"], axis=1, inplace=True) + error_df.rename(columns={"predicted_values": "pm2_5"}, inplace=True) + + return error_df + + error_df1 = create_error_df( + train_data, train_target, clf.predict(train_data[features]) + ) + error_df2 = create_error_df( + test_data, test_target, clf.predict(test_data[features]) + ) + + error_features1 = [c for c in error_df1.columns if c not in ["errors"]] + error_features2 = [c for c in error_df2.columns if c not in ["errors"]] + + error_target1 = error_df1["errors"] + error_target2 = error_df2["errors"] + + error_clf = LGBMRegressor( + n_estimators=31, + colsample_bytree=1, + learning_rate=0.1, + metric="rmse", + max_depth=5, + random_state=42, + verbosity=2, + ) + + error_clf.fit( + error_df1[error_features1], + error_target1, + eval_set=[(error_df2[error_features2], error_target2)], + categorical_feature=["device_id", "site_id", "device_category"], + callbacks=[early_stopping(stopping_rounds=150)], + ) + + GCSUtils.upload_trained_model_to_gcs( + error_clf, project_id, bucket, f"{frequency}_error_model.pkl" + ) + + # TODO: quantile regression approach + # alphas = [0.025, 0.975] + # models = [] + # names = [ + # f"{frequency}_lower_quantile_model", + # f"{frequency}_upper_quantile_model", + # ] + # + # for alpha in alphas: + # clf = LGBMRegressor( + # n_estimators=best_params["n_estimators"], + # learning_rate=best_params["learning_rate"], + # colsample_bytree=best_params["colsample_bytree"], + # reg_alpha=best_params["reg_alpha"], + # reg_lambda=best_params["reg_lambda"], + # max_depth=best_params["max_depth"], + # random_state=42, + # verbosity=2, + # objective="quantile", + # alpha=alpha, + # metric="quantile", + # ) + # clf.fit( + # train_data[features], + # train_target, + # eval_set=[(test_data[features], test_target)], + # categorical_feature=["device_id", "site_id", "device_category"], + # ) + # models.append(clf) + # for n, m in zip(names, models): + # upload_trained_model_to_gcs(m, project_id, bucket, f"{n}.pkl") @staticmethod def generate_forecasts(data, project_name, bucket_name, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) data.columns = data.columns.str.strip() - data["pm2_5_lower"] = data["pm2_5_upper"] = data["margin_of_error"] = data["adjusted_forecast"] = 0 + data["margin_of_error"] = data["adjusted_forecast"] = 0 def get_forecasts( df_tmp, forecast_model, - lower_quantile_model, - upper_quantile_model, + error_model, frequency, horizon, ): @@ -363,7 +447,7 @@ def get_forecasts( df_tmp_no_ts = df_tmp.drop("timestamp", axis=1, inplace=False) # daily frequency if frequency == "daily": - df_tmp.tail(1)['timestamp'] += timedelta(days=1) + df_tmp.tail(1)["timestamp"] += timedelta(days=1) shifts1 = [1, 2, 3, 7, 14] for s in shifts1: df_tmp[f"pm2_5_last_{s}_day"] = df_tmp.shift(s, axis=0)["pm2_5"] @@ -372,32 +456,32 @@ def get_forecasts( functions = ["mean", "std", "max", "min"] for s in shifts2: for f in functions: - df_tmp[f"pm2_5_{f}_{s}_day"] = (df_tmp_no_ts.shift(1, axis=0).rolling(s).agg(f))[ - "pm2_5" - ] - - print('done') + df_tmp[f"pm2_5_{f}_{s}_day"] = ( + df_tmp_no_ts.shift(1, axis=0).rolling(s).agg(f) + )["pm2_5"] + print("done") elif frequency == "hourly": df_tmp.iloc[-1, df_tmp.columns.get_loc("timestamp")] = df_tmp.iloc[ - -2, df_tmp.columns.get_loc("timestamp") ] + pd.Timedelta(hours=1) # lag features shifts1 
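# Illustrative sketch, not part of the patch: the residual-model idea in miniature,
# where a second regressor learns the point model's errors and its prediction is
# added back as an adjustment; the data here is synthetic.
import numpy as np
from lightgbm import LGBMRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))
y = 5 * X[:, 0] + rng.normal(scale=2, size=500)

point_model = LGBMRegressor(n_estimators=50).fit(X, y)
residuals = point_model.predict(X) - y
error_model = LGBMRegressor(n_estimators=50).fit(X, residuals)
adjusted_forecast = point_model.predict(X[:5]) + error_model.predict(X[:5])
print(adjusted_forecast)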
= [1, 2, 6, 12] for s in shifts1: - df_tmp[f"pm2_5_last_{s}_hour"] = df_tmp.shift(s, axis=0)["pm2_5"] - + df_tmp[f"pm2_5_last_{s}_hour"] = df_tmp.shift(s, axis=0)[ + "pm2_5" + ] # rolling features shifts2 = [3, 6, 12, 24] functions = ["mean", "std", "median", "skew"] for s in shifts2: for f in functions: - df_tmp[f"pm2_5_{f}_{s}_hour"] = (df_tmp.shift(1, axis=0).rolling(s).agg(f))["pm2_5"] - + df_tmp[f"pm2_5_{f}_{s}_hour"] = ( + df_tmp.shift(1, axis=0).rolling(s).agg(f) + )["pm2_5"] attributes = ["year", "month", "day", "dayofweek"] max_vals = [2023, 12, 30, 7] @@ -405,40 +489,59 @@ def get_forecasts( attributes.append("hour") max_vals.append(23) for a, m in zip(attributes, max_vals): - df_tmp.tail(1)[f"{a}_sin"] = np.sin(2 * np.pi * df_tmp.tail(1)["timestamp"].dt.__getattribute__(a) / m) - df_tmp.tail(1)[f"{a}_cos"] = np.cos(2 * np.pi * df_tmp.tail(1)["timestamp"].dt.__getattribute__(a) / m) - df_tmp.tail(1)["week_sin"] = np.sin(2 * np.pi * df_tmp.tail(1)["timestamp"].dt.isocalendar().week / 52) - df_tmp.tail(1)["week_cos"] = np.cos(2 * np.pi * df_tmp.tail(1)["timestamp"].dt.isocalendar().week / 52) + df_tmp.tail(1)[f"{a}_sin"] = np.sin( + 2 + * np.pi + * df_tmp.tail(1)["timestamp"].dt.__getattribute__(a) + / m + ) + df_tmp.tail(1)[f"{a}_cos"] = np.cos( + 2 + * np.pi + * df_tmp.tail(1)["timestamp"].dt.__getattribute__(a) + / m + ) + df_tmp.tail(1)["week_sin"] = np.sin( + 2 * np.pi * df_tmp.tail(1)["timestamp"].dt.isocalendar().week / 52 + ) + df_tmp.tail(1)["week_cos"] = np.cos( + 2 * np.pi * df_tmp.tail(1)["timestamp"].dt.isocalendar().week / 52 + ) # make predictions - excluded_columns = ["pm2_5", "timestamp", "margin_of_error", "pm2_5_lower", "pm2_5_upper", "adjusted_forecast"] + excluded_columns = [ + "pm2_5", + "timestamp", + "margin_of_error", + "adjusted_forecast", + ] + excluded_columns_2 = [ + "timestamp", + "margin_of_error", + "adjusted_forecast", + ] print(df_tmp.tail(1)) - # df_tmp.tail(1)['pm2_5'] = forecast_model.predict(df_tmp.tail(1).drop(excluded_columns).values.reshape(1, -1)) df_tmp.loc[df_tmp.index[-1], "pm2_5"] = forecast_model.predict( df_tmp.drop(excluded_columns, axis=1).tail(1).values.reshape(1, -1) ) - df_tmp.loc[df_tmp.index[-1], "pm2_5_lower"] = lower_quantile_model.predict( - df_tmp.drop(excluded_columns, axis=1).tail(1).values.reshape(1, -1) + df_tmp.loc[df_tmp.index[-1], "margin_of_error"] = error_model.predict( + df_tmp.drop(excluded_columns_2, axis=1) + .tail(1) + .values.reshape(1, -1) ) - df_tmp.loc[df_tmp.index[-1], "pm2_5_upper"] = upper_quantile_model.predict( - df_tmp.drop(excluded_columns, axis=1).tail(1).values.reshape(1, -1) + df_tmp.loc[df_tmp.index[-1], "adjusted_forecast"] = ( + df_tmp.loc[df_tmp.index[-1], "pm2_5"] + + df_tmp.loc[df_tmp.index[-1], "margin_of_error"] ) - df_tmp.loc[df_tmp.index[-1], "margin_of_error"] = (df_tmp.loc[df_tmp.index[-1], "pm2_5_upper"] - df_tmp.loc[df_tmp.index[-1], "pm2_5_lower"])/2 - df_tmp.loc[df_tmp.index[-1], "adjusted_forecast"] = df_tmp.loc[df_tmp.index[-1], "pm2_5"] + df_tmp.loc[df_tmp.index[-1], "margin_of_error"] - - return df_tmp.iloc[-int(horizon) :, :] forecasts = pd.DataFrame() - forecast_model = get_trained_model_from_gcs( + forecast_model = GCSUtils.get_trained_model_from_gcs( project_name, bucket_name, f"{frequency}_forecast_model.pkl" ) - lower_quantile_model = get_trained_model_from_gcs( - project_name, bucket_name, f"{frequency}_lower_quantile_model.pkl" - ) - upper_quantile_model = get_trained_model_from_gcs( - project_name, bucket_name, f"{frequency}_upper_quantile_model.pkl" + error_model = 
GCSUtils.get_trained_model_from_gcs( + project_name, bucket_name, f"{frequency}_error_model.pkl" ) df_tmp = data.copy() @@ -452,8 +555,7 @@ def get_forecasts( device_forecasts = get_forecasts( test_copy, forecast_model, - lower_quantile_model, - upper_quantile_model, + error_model, frequency, horizon, ) @@ -461,33 +563,46 @@ def get_forecasts( forecasts = pd.concat([forecasts, device_forecasts], ignore_index=True) print(device) - forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) - forecasts["pm2_5_lower"] = forecasts["pm2_5_lower"].astype(float) - forecasts["pm2_5_upper"] = forecasts["pm2_5_upper"].astype(float) forecasts["margin_of_error"] = forecasts["margin_of_error"].astype(float) - decode_categorical_features(forecasts, frequency) + + DecodingUtils.decode_categorical_features_before_save(forecasts, frequency) + forecasts = forecasts[ + [ + "device_id", + "site_id", + "timestamp", + "pm2_5", + "margin_of_error", + "adjusted_forecast", + ] + ] return forecasts @staticmethod def save_forecasts_to_mongo(data, frequency): - timestamp = pd.to_datetime(datetime.now()).isoformat() - device_numbers = data["device_number"].unique() + device_ids = data["device_id"].unique() + created_at = pd.to_datetime(datetime.now()).isoformat() forecast_results = [ { - field: data[data["device_number"] == i][field].tolist()[0] - if field != "pm2_5" and field != "time" and field != "health_tips" - else data[data["device_number"] == i][field].tolist() + field: data[data["device_id"] == i][field].tolist()[0] + if field + not in ["pm2_5", "margin_of_error", "adjusted_forecast", "timestamp"] + else data[data["device_id"] == i][field].tolist() for field in data.columns } - | {"timestamp": timestamp} - for i in device_numbers + | {"created_at": created_at} + for i in device_ids ] client = pm.MongoClient(configuration.MONGO_URI) db = client[configuration.MONGO_DATABASE_NAME] if frequency == "hourly": + db.daily_forecasts.delete_many({}) db.hourly_forecasts.insert_many(forecast_results) + print(db.hourly_forecasts.find_one()) # confirm saving has worked elif frequency == "daily": + db.daily_forecasts.delete_many({}) db.daily_forecasts.insert_many(forecast_results) + print(db.daily_forecasts.find_one()) else: raise ValueError("Invalid frequency argument") diff --git a/src/airflow/dags/ml_training_jobs.py b/src/airflow/dags/ml_training_jobs.py index 836441c52a..a517d6a3a7 100644 --- a/src/airflow/dags/ml_training_jobs.py +++ b/src/airflow/dags/ml_training_jobs.py @@ -1,10 +1,12 @@ from airflow.decorators import dag, task from airqo_etl_utils.airflow_custom_utils import AirflowUtils -from airqo_etl_utils.bigquery_api import BigQueryApi from airqo_etl_utils.config import configuration -from airqo_etl_utils.date import date_to_str from airqo_etl_utils.ml_utils import ForecastUtils +from airqo_etl_utils.date import date_to_str +from dateutil.relativedelta import relativedelta +from airqo_etl_utils.bigquery_api import BigQueryApi +from datetime import datetime @dag( @@ -18,9 +20,6 @@ def train_forecasting_models(): # Hourly forecast tasks @task() def fetch_training_data_for_hourly_forecast_model(): - from dateutil.relativedelta import relativedelta - from datetime import datetime - current_date = datetime.today() start_date = current_date - relativedelta( months=int(configuration.HOURLY_FORECAST_TRAINING_JOB_SCOPE) @@ -33,7 +32,7 @@ def preprocess_training_data_for_hourly_forecast_model(data): return ForecastUtils.preprocess_data(data, "hourly") @task() - def 
feature_engineer_training_data_for_hourly_forecast_model(data): + def feat_engineer_training_data_for_hourly_forecast_model(data): return ForecastUtils.feature_eng_data(data, "pm2_5", "hourly", "train") @task() @@ -69,7 +68,7 @@ def train_and_save_daily_model(train_data): hourly_data = fetch_training_data_for_hourly_forecast_model() hourly_data = preprocess_training_data_for_hourly_forecast_model(hourly_data) - hourly_data = feature_engineer_training_data_for_hourly_forecast_model(hourly_data) + hourly_data = feat_engineer_training_data_for_hourly_forecast_model(hourly_data) train_and_save_hourly_forecast_model(hourly_data) daily_data = fetch_training_data_for_daily_forecast_model() diff --git a/src/airflow/requirements.txt b/src/airflow/requirements.txt index 7396bc939c..f5df386d68 100644 --- a/src/airflow/requirements.txt +++ b/src/airflow/requirements.txt @@ -7,7 +7,7 @@ kafka-python simplejson~=3.19.1 sentry-sdk geopy -mlflow~=2.5.0 +mlflow lightgbm~=4.0.0 setuptools~=68.0.0 urllib3~=1.26.16 diff --git a/src/predict/api/helpers.py b/src/predict/api/helpers.py index baa1777d93..f455387d2b 100644 --- a/src/predict/api/helpers.py +++ b/src/predict/api/helpers.py @@ -186,7 +186,7 @@ def get_predictions_by_geo_coordinates_v2(latitude: float, longitude: float) -> @cache.memoize(timeout=Config.CACHE_TIMEOUT) def get_forecasts( db_name, - device_id=None, + device_id=None, site_id=None, site_name=None, parish=None, @@ -218,13 +218,14 @@ def get_forecasts( for time, pm2_5, margin_of_error, adjusted_forecast in zip( site_forecasts[0]["timestamp"], site_forecasts[0]["pm2_5"], - site_forecasts[0]["margin_of_error"], - site_forecasts[0]["adjusted_forecast"], + site_forecasts[0]["margin_of_error"], + site_forecasts[0]["adjusted_forecast"], ): result = { key: value for key, value in zip( - ["time", "pm2_5", "margin_of_error", "adjusted_forecast"], [time, pm2_5, margin_of_error, adjusted_forecast] + ["time", "pm2_5", "margin_of_error", "adjusted_forecast"], + [time, pm2_5, margin_of_error, adjusted_forecast], ) } results.append(result) diff --git a/src/predict/api/prediction.py b/src/predict/api/prediction.py index 068ae4ce48..a58bd2ae82 100644 --- a/src/predict/api/prediction.py +++ b/src/predict/api/prediction.py @@ -77,10 +77,6 @@ def get_next_24hr_forecasts(): """ Get forecasts for the next 24 hours from specified start time. """ - - """ - Get forecasts for the next 1 week from specified start day. 
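# Illustrative sketch, not part of the patch: selecting the health tips whose AQI
# band contains a forecast value; only the "aqi_category" min/max keys mirror the
# API payload, the other fields are made up.
health_tips = [
    {"title": "Enjoy outdoor activities", "aqi_category": {"min": 0, "max": 12}},
    {"title": "Reduce prolonged exertion", "aqi_category": {"min": 12.1, "max": 35.4}},
]
pm2_5 = 20.3
matching_tips = [
    tip
    for tip in health_tips
    if tip["aqi_category"]["min"] <= pm2_5 <= tip["aqi_category"]["max"]
]
print(matching_tips)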
- """ params = { name: request.args.get(name, default=None, type=str) for name in [ @@ -152,8 +148,8 @@ def get_next_1_week_forecasts(): forecast["health_tips"] = list( filter( lambda x: x["aqi_category"]["max"] - >= pm2_5 - >= x["aqi_category"]["min"], + >= pm2_5 + >= x["aqi_category"]["min"], health_tips, ) ) From 071d35120aa07992a6bc76c429d31e146602fc7e Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Thu, 31 Aug 2023 00:27:44 +0300 Subject: [PATCH 16/43] disable error functionality --- src/airflow/airqo_etl_utils/ml_utils.py | 195 ++++++++++++------------ 1 file changed, 98 insertions(+), 97 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 591f72e133..630d936b90 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -99,6 +99,7 @@ def decode_categorical_features_before_save(df, frequency): df[col] = df[col].map({v: k for k, v in mapping.items()}) return df + @staticmethod def encode_categorical_training_features(df, freq): df1 = df.copy() columns = ["device_id", "site_id", "device_category"] @@ -205,10 +206,18 @@ def get_time_and_cyclic_features(df, freq): df1.drop(columns=attributes + ["week"], inplace=True) return df1 + def get_location_cord(df): + df["x_cord"] = np.cos(df["latitude"]) * np.cos(df["longitude"]) + df["y_cord"] = np.cos(df["latitude"]) * np.sin(df["longitude"]) + df["z_cord"] = np.sin(df["latitude"]) + + return df + df_tmp = data.copy() df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) df_tmp = get_lag_features(df_tmp, target_column, data_frequency) df_tmp = get_time_and_cyclic_features(df_tmp, data_frequency) + df_tmp = get_location_cord(df_tmp) if job_type == "train": df_tmp = DecodingUtils.encode_categorical_training_features( df_tmp, data_frequency @@ -233,23 +242,20 @@ def train_and_save_forecast_models(training_data, frequency): Perform the actual training for hourly data """ training_data["timestamp"] = pd.to_datetime(training_data["timestamp"]) - features = [c for c in training_data.columns if c not in ["timestamp", "pm2_5"]] + features = [ + c + for c in training_data.columns + if c not in ["timestamp", "pm2_5", "latitude", "longitude"] + ] print(features) target_col = "pm2_5" train_data = validation_data = test_data = pd.DataFrame() for device in training_data["device_id"].unique(): device_df = training_data[training_data["device_id"] == device] months = device_df["timestamp"].dt.month.unique() - train_months = val_months = test_months = [] - if frequency == "hourly": - train_months = months[:2] - val_months = months[2:3] - test_months = months[3:] - elif frequency == "daily": - train_months = months[:6] - val_months = months[6:7] - test_months = months[7:] - + train_months = months[:8] + val_months = months[8:9] + test_months = months[9:] train_df = device_df[device_df["timestamp"].dt.month.isin(train_months)] val_df = device_df[device_df["timestamp"].dt.month.isin(val_months)] test_df = device_df[device_df["timestamp"].dt.month.isin(test_months)] @@ -345,56 +351,56 @@ def objective(trial): clf, project_id, bucket, f"{frequency}_forecast_model.pkl" ) - def create_error_df(data, target, preds): - error_df = pd.DataFrame( - { - "actual_values": target, - "predicted_values": preds, - } - ) - error_df["errors"] = ( - error_df["predicted_values"] - error_df["actual_values"] - ) - error_df = pd.concat([error_df, data], axis=1) - error_df.drop(["actual_values", "pm2_5"], axis=1, inplace=True) - 
error_df.rename(columns={"predicted_values": "pm2_5"}, inplace=True) - - return error_df - - error_df1 = create_error_df( - train_data, train_target, clf.predict(train_data[features]) - ) - error_df2 = create_error_df( - test_data, test_target, clf.predict(test_data[features]) - ) - - error_features1 = [c for c in error_df1.columns if c not in ["errors"]] - error_features2 = [c for c in error_df2.columns if c not in ["errors"]] - - error_target1 = error_df1["errors"] - error_target2 = error_df2["errors"] - - error_clf = LGBMRegressor( - n_estimators=31, - colsample_bytree=1, - learning_rate=0.1, - metric="rmse", - max_depth=5, - random_state=42, - verbosity=2, - ) - - error_clf.fit( - error_df1[error_features1], - error_target1, - eval_set=[(error_df2[error_features2], error_target2)], - categorical_feature=["device_id", "site_id", "device_category"], - callbacks=[early_stopping(stopping_rounds=150)], - ) - - GCSUtils.upload_trained_model_to_gcs( - error_clf, project_id, bucket, f"{frequency}_error_model.pkl" - ) + # def create_error_df(data, target, preds): + # error_df = pd.DataFrame( + # { + # "actual_values": target, + # "predicted_values": preds, + # } + # ) + # error_df["errors"] = ( + # error_df["predicted_values"] - error_df["actual_values"] + # ) + # error_df = pd.concat([error_df, data], axis=1) + # error_df.drop(["actual_values", "pm2_5"], axis=1, inplace=True) + # error_df.rename(columns={"predicted_values": "pm2_5"}, inplace=True) + # + # return error_df + # + # error_df1 = create_error_df( + # train_data, train_target, clf.predict(train_data[features]) + # ) + # error_df2 = create_error_df( + # test_data, test_target, clf.predict(test_data[features]) + # ) + # + # error_features1 = [c for c in error_df1.columns if c not in ["errors"]] + # error_features2 = [c for c in error_df2.columns if c not in ["errors"]] + # + # error_target1 = error_df1["errors"] + # error_target2 = error_df2["errors"] + # + # error_clf = LGBMRegressor( + # n_estimators=31, + # colsample_bytree=1, + # learning_rate=0.1, + # metric="rmse", + # max_depth=5, + # random_state=42, + # verbosity=2, + # ) + # + # error_clf.fit( + # error_df1[error_features1], + # error_target1, + # eval_set=[(error_df2[error_features2], error_target2)], + # categorical_feature=["device_id", "site_id", "device_category"], + # callbacks=[early_stopping(stopping_rounds=150)], + # ) + # + # GCSUtils.upload_trained_model_to_gcs( + # error_clf, project_id, bucket, f"{frequency}_error_model.pkl" + # ) # TODO: quantile regression approach # alphas = [0.025, 0.975] @@ -432,12 +438,11 @@ def create_error_df(data, target, preds): def generate_forecasts(data, project_name, bucket_name, frequency): data["timestamp"] = pd.to_datetime(data["timestamp"]) data.columns = data.columns.str.strip() - data["margin_of_error"] = data["adjusted_forecast"] = 0 + # data["margin_of_error"] = data["adjusted_forecast"] = 0 def get_forecasts( df_tmp, forecast_model, - error_model, frequency, horizon, ): @@ -460,8 +465,6 @@ def get_forecasts( df_tmp_no_ts.shift(1, axis=0).rolling(s).agg(f) )["pm2_5"] - print("done") - elif frequency == "hourly": df_tmp.iloc[-1, df_tmp.columns.get_loc("timestamp")] = df_tmp.iloc[ -2, df_tmp.columns.get_loc("timestamp") @@ -480,7 +483,7 @@ def get_forecasts( for s in shifts2: for f in functions: df_tmp[f"pm2_5_{f}_{s}_hour"] = ( - df_tmp.shift(1, axis=0).rolling(s).agg(f) + df_tmp_no_ts.shift(1, axis=0).rolling(s).agg(f) )["pm2_5"] attributes = ["year", "month", "day", "dayofweek"] @@ -508,41 +511,41 @@ def get_forecasts( 2 * 
np.pi * df_tmp.tail(1)["timestamp"].dt.isocalendar().week / 52 ) - # make predictions excluded_columns = [ "pm2_5", "timestamp", - "margin_of_error", - "adjusted_forecast", + "latitude", + "longitude", + # "margin_of_error", + # "adjusted_forecast", ] - excluded_columns_2 = [ - "timestamp", - "margin_of_error", - "adjusted_forecast", - ] - print(df_tmp.tail(1)) + # excluded_columns_2 = [ + # "timestamp", + # "margin_of_error", + # "adjusted_forecast", + # ] df_tmp.loc[df_tmp.index[-1], "pm2_5"] = forecast_model.predict( df_tmp.drop(excluded_columns, axis=1).tail(1).values.reshape(1, -1) ) - df_tmp.loc[df_tmp.index[-1], "margin_of_error"] = error_model.predict( - df_tmp.drop(excluded_columns_2, axis=1) - .tail(1) - .values.reshape(1, -1) - ) - df_tmp.loc[df_tmp.index[-1], "adjusted_forecast"] = ( - df_tmp.loc[df_tmp.index[-1], "pm2_5"] - + df_tmp.loc[df_tmp.index[-1], "margin_of_error"] - ) - - return df_tmp.iloc[-int(horizon) :, :] + # df_tmp.loc[df_tmp.index[-1], "margin_of_error"] = error_model.predict( + # df_tmp.drop(excluded_columns_2, axis=1) + # .tail(1) + # .values.reshape(1, -1) + # ) + # df_tmp.loc[df_tmp.index[-1], "adjusted_forecast"] = ( + # df_tmp.loc[df_tmp.index[-1], "pm2_5"] + # + df_tmp.loc[df_tmp.index[-1], "margin_of_error"] + # ) + + return df_tmp.iloc[-int(horizon):, :] forecasts = pd.DataFrame() forecast_model = GCSUtils.get_trained_model_from_gcs( project_name, bucket_name, f"{frequency}_forecast_model.pkl" ) - error_model = GCSUtils.get_trained_model_from_gcs( - project_name, bucket_name, f"{frequency}_error_model.pkl" - ) + # error_model = GCSUtils.get_trained_model_from_gcs( + # project_name, bucket_name, f"{frequency}_error_model.pkl" + # ) df_tmp = data.copy() for device in df_tmp["device_id"].unique(): @@ -555,7 +558,6 @@ def get_forecasts( device_forecasts = get_forecasts( test_copy, forecast_model, - error_model, frequency, horizon, ) @@ -564,7 +566,7 @@ def get_forecasts( print(device) forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) - forecasts["margin_of_error"] = forecasts["margin_of_error"].astype(float) + # forecasts["margin_of_error"] = forecasts["margin_of_error"].astype(float) DecodingUtils.decode_categorical_features_before_save(forecasts, frequency) forecasts = forecasts[ @@ -573,8 +575,8 @@ def get_forecasts( "site_id", "timestamp", "pm2_5", - "margin_of_error", - "adjusted_forecast", + # "margin_of_error", + # "adjusted_forecast", ] ] return forecasts @@ -586,8 +588,7 @@ def save_forecasts_to_mongo(data, frequency): forecast_results = [ { field: data[data["device_id"] == i][field].tolist()[0] - if field - not in ["pm2_5", "margin_of_error", "adjusted_forecast", "timestamp"] + if field not in ["pm2_5", "timestamp"] else data[data["device_id"] == i][field].tolist() for field in data.columns } @@ -597,12 +598,12 @@ def save_forecasts_to_mongo(data, frequency): client = pm.MongoClient(configuration.MONGO_URI) db = client[configuration.MONGO_DATABASE_NAME] if frequency == "hourly": - db.daily_forecasts.delete_many({}) + db.hourly_forecasts.delete_many({}) db.hourly_forecasts.insert_many(forecast_results) print(db.hourly_forecasts.find_one()) # confirm saving has worked elif frequency == "daily": db.daily_forecasts.delete_many({}) db.daily_forecasts.insert_many(forecast_results) - print(db.daily_forecasts.find_one()) + print(db.daily_forecasts_1.find_one()) else: raise ValueError("Invalid frequency argument") From fc2b8dddeb0246e3b23936476eb62343e84f17a8 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> 
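# Illustrative sketch, not part of the patch: shaping a forecast frame into one
# document per device (series fields as lists, metadata as scalars) before an
# insert_many; the collection name and frame contents below are made up.
import pandas as pd

forecasts = pd.DataFrame({
    "device_id": ["aq_01", "aq_01", "aq_02"],
    "timestamp": ["2023-09-01", "2023-09-02", "2023-09-01"],
    "pm2_5": [12.1, 13.4, 30.2],
})
docs = [
    {
        "device_id": device_id,
        "timestamp": group["timestamp"].tolist(),
        "pm2_5": group["pm2_5"].tolist(),
    }
    for device_id, group in forecasts.groupby("device_id")
]
print(docs)
# then e.g. db.daily_forecasts.delete_many({}); db.daily_forecasts.insert_many(docs)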
Date: Wed, 6 Sep 2023 18:50:52 +0000 Subject: [PATCH 17/43] add bigquery method tests --- src/airflow/airqo_etl_utils/bigquery_api.py | 13 +- .../tests/big_query_api_tests.py | 150 ++++++++++++------ 2 files changed, 110 insertions(+), 53 deletions(-) diff --git a/src/airflow/airqo_etl_utils/bigquery_api.py b/src/airflow/airqo_etl_utils/bigquery_api.py index 1d720b6772..65317a0e77 100644 --- a/src/airflow/airqo_etl_utils/bigquery_api.py +++ b/src/airflow/airqo_etl_utils/bigquery_api.py @@ -619,6 +619,10 @@ def fetch_data( self, start_date_time: str, ) -> pd.DataFrame: + try: + pd.to_datetime(start_date_time) + except ValueError: + raise ValueError(f"Invalid start date time: {start_date_time}") query = f""" SELECT DISTINCT t1.device_id, @@ -637,8 +641,13 @@ def fetch_data( job_config = bigquery.QueryJobConfig() job_config.use_query_cache = True - df = self.client.query(f"{query}", job_config).result().to_dataframe() - return df + try: + df = self.client.query(query, job_config).result().to_dataframe() + return df + except Exception as e: + print("Error fetching data from bigquery") + + @staticmethod def save_forecasts_to_bigquery(df, table): diff --git a/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py b/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py index e519ad033d..a2d00f4b71 100644 --- a/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py +++ b/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py @@ -1,4 +1,6 @@ # Import pytest and other modules as needed +from unittest import mock + import pandas as pd import pytest @@ -6,7 +8,7 @@ @pytest.fixture -def mock_bigquery_client(mocker): +def mock_bigquery_client1(mocker): mock_client = mocker.Mock() mock_client.query.return_value.result.return_value.to_dataframe.return_value = ( pd.DataFrame( @@ -21,66 +23,112 @@ def mock_bigquery_client(mocker): return mock_client +@pytest.fixture +def mock_bigquery_client2(): + """A fixture that mocks the bigquery.Client object.""" + + fake_client = mock.Mock() + + sample_df = pd.DataFrame( + { + "device_id": ["A", "A", "B", "B"], + "timestamp": [ + "2023-01-01 00:00:00", + "2023-01-01 01:00:00", + "2023-01-01 00:00:00", + "2023-01-01 01:00:00", + ], + "site_id": [1, 1, 2, 2], + "pm2_5": [10.0, 12.0, 15.0, 18.0], + "latitude": [10.0, 10.0, 20.0, 20.0], + "longitude": [10.0, 10.0, 20.0, 20.0], + "device_category": ["A", "A", "B", "B"], + } + ) + + fake_data_empty_result = pd.DataFrame() + + fake_error = "Fake error" + + def fake_query(query, job_config): + fake_job = mock.Mock() + + if "2023-01-01" in query: + fake_job.result.return_value.to_dataframe.return_value = ( + sample_df + ) + elif "2023-01-02" in query: + fake_job.result.return_value.to_dataframe.return_value = ( + fake_data_empty_result + ) + elif "2023-01-03" in query: + fake_job.result.side_effect = fake_error + else: + raise ValueError("Invalid date") + + return fake_job + + fake_client.query.side_effect = fake_query + + return fake_client + @pytest.mark.parametrize( - "method", + "start_date_time, expected_df", [ - BigQueryApi.fetch_hourly_forecast_training_data, - BigQueryApi.fetch_daily_forecast_training_data, + ( + "2023-01-01", + pd.DataFrame( + { + "device_id": ["A", "A", "B", "B"], + "timestamp": [ + "2023-01-01 00:00:00", + "2023-01-01 01:00:00", + "2023-01-01 00:00:00", + "2023-01-01 01:00:00", + ], + "site_id": [1, 1, 2, 2], + "pm2_5": [10.0, 12.0, 15.0, 18.0], + "latitude": [10.0, 10.0, 20.0, 20.0], + "longitude": [10.0, 10.0, 20.0, 20.0], + "device_category": ["A", "A", "B", "B"], + } + ), + ), + 
("2023-01-02", pd.DataFrame()), ], ) -def test_fetch_data_columns(method, mock_bigquery_client): - api = BigQueryApi() - api.client = mock_bigquery_client - df = method(api) - assert list(df.columns) == ["created_at", "device_number", "pm2_5"] - assert isinstance(df, pd.DataFrame) - assert not df.empty +def test_fetch_data_correct_se(mock_bigquery_client2, start_date_time, expected_df): + """Tests the fetch_data method for the happy path scenarios.""" -def test_fetch_hourly_forecast_training_data_exception(mock_bigquery_client): - api = BigQueryApi() - api.client = mock_bigquery_client - api.client.query.side_effect = Exception("Bigquery error") - with pytest.raises(Exception) as e: - df = api.fetch_hourly_forecast_training_data() - assert "Bigquery error" in str(e.value) + bq_api = BigQueryApi() + bq_api.client = mock_bigquery_client2 + actual_df = bq_api.fetch_data(start_date_time) + pd.testing.assert_frame_equal(actual_df, expected_df) -def test_fetch_hourly_forecast_training_data_null(): - api = BigQueryApi() - api.client = mock_bigquery_client() - api.client.query.return_value.result.return_value.to_dataframe.return_value = ( - pd.DataFrame( - { - "created_at": ["2021-01-01 00:00:00", "2021-01-01 01:00:00"], - "device_number": [1, 2], - "pm2_5": [None, None], - } - ) - ) - with pytest.raises(Exception) as e: - df = api.fetch_hourly_forecast_training_data() - assert "pm2_5 column cannot be null" in str(e.value) +@pytest.mark.parametrize("start_date_time", ["2023-13-01", "2023-01-32", "invalid"]) +def test_fetch_data_invalid_date(mock_bigquery_client2, start_date_time): + """Tests the fetch_data method for the scenario where an invalid date string is passed.""" + + bq_api = BigQueryApi() + bq_api.client = mock_bigquery_client2 + + with pytest.raises(ValueError): + bq_api.fetch_data(start_date_time) + +@pytest.mark.parametrize("start_date_time", ["2023-01-03"]) +def test_fetch_data_bigquery_error(mock_bigquery_client2, start_date_time): + """Tests the fetch_data method for the scenario where a bigquery.GoogleAPIError is raised.""" + + # Create an instance of BigQueryApi with the mocked client + bq_api = BigQueryApi() + bq_api.client = mock_bigquery_client2 + + with pytest.raises(Exception): + bq_api.fetch_data(start_date_time) -def test_fetch_daily_forecast_training_data_date_range(mock_bigquery_client): - api = BigQueryApi() - api.client = mock_bigquery_client - api.client.query.return_value.result.return_value.to_dataframe.return_value = ( - pd.DataFrame( - { - "created_at": [ - "2020-01-01 00:00:00", - "2020-06-01 00:00:00", - "2020-12-01 00:00:00", - ], - "device_number": [1, 2, 3], - "pm2_5": [10, 20, 30], - } - ) - ) - df = api.fetch_daily_forecast_training_data() - assert df["created_at"].min() >= pd.Timestamp.now() - pd.DateOffset(months=12) def test_fetch_raw_readings_empty(mock_bigquery_client): From 545be08bec375619a500b5b096608ab534b96369 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Thu, 14 Sep 2023 11:17:10 +0300 Subject: [PATCH 18/43] add preprocessing tests --- src/airflow/airqo_etl_utils/ml_utils.py | 14 +++- .../tests/airqo_utils_tests.py | 1 - .../tests/big_query_api_tests.py | 2 +- src/airflow/airqo_etl_utils/tests/conftest.py | 78 +++---------------- .../airqo_etl_utils/tests/ml_utils_tests.py | 44 ++++------- 5 files changed, 40 insertions(+), 99 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 630d936b90..095a3caf0d 100644 --- 
a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -14,7 +14,6 @@ from .config import configuration -fixed_columns = ["site_id"] project_id = configuration.GOOGLE_CLOUD_PROJECT_ID bucket = configuration.FORECAST_MODELS_BUCKET environment = configuration.ENVIRONMENT @@ -126,7 +125,18 @@ def encode_categorical_training_features(df, freq): class ForecastUtils: @staticmethod def preprocess_data(data, data_frequency): - data["timestamp"] = pd.to_datetime(data["timestamp"]) + required_columns = {"device_id", "site_id", "device_category", "pm2_5", "timestamp"} + if not required_columns.issubset(data.columns): + missing_columns = required_columns.difference(data.columns) + raise ValueError( + f"Provided dataframe missing necessary columns: {', '.join(missing_columns)}" + ) + try: + data["timestamp"] = pd.to_datetime(data["timestamp"]) + except ValueError as e: + raise ValueError( + "datetime conversion error, please provide timestamp in valid format" + ) data["pm2_5"] = data.groupby(["device_id", "site_id", "device_category"])[ "pm2_5" ].transform(lambda x: x.interpolate(method="linear", limit_direction="both")) diff --git a/src/airflow/airqo_etl_utils/tests/airqo_utils_tests.py b/src/airflow/airqo_etl_utils/tests/airqo_utils_tests.py index 6e3792f74e..0d1541df67 100644 --- a/src/airflow/airqo_etl_utils/tests/airqo_utils_tests.py +++ b/src/airflow/airqo_etl_utils/tests/airqo_utils_tests.py @@ -10,7 +10,6 @@ from airqo_etl_utils.tests.conftest import FaultDetectionFixtures -# TODO: Convert to pytest class TestAirQoDataUtils(unittest.TestCase): def test_map_site_ids_to_historical_data(self): logs = pd.DataFrame( diff --git a/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py b/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py index a2d00f4b71..da4a20e4f4 100644 --- a/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py +++ b/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py @@ -99,7 +99,7 @@ def fake_query(query, job_config): ) def test_fetch_data_correct_se(mock_bigquery_client2, start_date_time, expected_df): - """Tests the fetch_data method for the happy path scenarios.""" + """Tests the fetch_data method for scenarios when correct data is retrieved.""" bq_api = BigQueryApi() bq_api.client = mock_bigquery_client2 diff --git a/src/airflow/airqo_etl_utils/tests/conftest.py b/src/airflow/airqo_etl_utils/tests/conftest.py index 8ec08842b2..e63ec4dab8 100644 --- a/src/airflow/airqo_etl_utils/tests/conftest.py +++ b/src/airflow/airqo_etl_utils/tests/conftest.py @@ -1,7 +1,7 @@ -import numpy as np +from datetime import datetime + import pandas as pd import pytest -from datetime import datetime def pytest_configure(config): @@ -13,75 +13,17 @@ def pytest_configure(config): class ForecastFixtures: @staticmethod @pytest.fixture(scope="session") - def hourly_data(): - return pd.DataFrame( - { - "device_number": [1, 1, 1, 2, 2, 2], - "created_at": [ - "2021-08-01 00:00:00", - "2021-08-01 01:00:00", - "2021-08-01 02:00:00", - "2021-08-01 00:00:00", - "2021-08-01 01:00:00", - "2021-08-01 02:00:00", - ], - "pm2_5": [10.0, np.nan, 12.0, 15.0, np.nan, np.nan], - } - ) - - @staticmethod - @pytest.fixture(scope="session") - def daily_data(): - return pd.DataFrame( - { - "device_number": [1, 1, 1, 2, 2, 2], - "created_at": [ - "2021-08-01 00:00:00", - "2021-08-02 00:00:00", - "2021-08-03 00:00:00", - "2021-08-01 00:00:00", - "2021-08-02 00:00:00", - "2021-08-03 00:00:00", - ], - "pm2_5": [10.0, np.nan, 12.0, 15.0, np.nan, np.nan], - } - ) - - 
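# Illustrative sketch, not part of the patch: per-device gap filling and daily
# resampling in the style of the preprocessing step; the frame below is a tiny
# synthetic example.
import numpy as np
import pandas as pd

data = pd.DataFrame({
    "device_id": ["A", "A", "A", "B", "B"],
    "timestamp": pd.to_datetime([
        "2023-01-01 00:00", "2023-01-01 01:00", "2023-01-02 00:00",
        "2023-01-01 00:00", "2023-01-01 01:00",
    ]),
    "pm2_5": [10.0, np.nan, 14.0, 20.0, np.nan],
})
data["pm2_5"] = data.groupby("device_id")["pm2_5"].transform(
    lambda x: x.interpolate(method="linear", limit_direction="both")
)
daily = (
    data.groupby("device_id")
    .resample("D", on="timestamp")
    .mean(numeric_only=True)
    .reset_index()
)
print(daily)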
@staticmethod - @pytest.fixture(scope="session") - def hourly_output(): - return pd.DataFrame( - { - "device_number": [1, 1, 1, 2, 2, 2], - "created_at": [ - "2021-08-01 00:00:00", - "2021-08-01 01:00:00", - "2021-08-01 02:00:00", - "2021-08-01 00:00:00", - "2021-08-01 01:00:00", - "2021-08-01 02:00:00", - ], - "pm2_5": [10.0, 11.0, 12.0, 15.0, 16.0, 17.0], - } - ) - - @staticmethod - @pytest.fixture(scope="session") - def daily_output(): - return pd.DataFrame( + def example_data(): + data = pd.DataFrame( { - "device_number": [1, 1, 1, 2, 2, 2], - "created_at": [ - "2021-08-01 00:00:00", - "2021-08-02 00:00:00", - "2021-08-03 00:00:00", - "2021-08-01 00:00:00", - "2021-08-02 00:00:00", - "2021-08-03 00:00:00", - ], - "pm2_5": [10.0, 11.0, 12.0, 15.0, 16.0, 17.0], + "device_id": ["A", "B"], + "site_id": ["X", "Y"], + "device_category": ["LOWCOST", "BAM"], + "pm2_5": [1, 2], + "timestamp": ["2023-01-01", "2023-02-01"], } ) + return data @pytest.fixture(scope="session") diff --git a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py index e28e592cea..0e707711c4 100644 --- a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py +++ b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py @@ -1,32 +1,22 @@ -# TODO: Add tests for ml_utils.py +import pytest -import pandas as pd - -from airqo_etl_utils.ml_utils import ForecastUtils +from airqo_etl_utils.ml_utils import ForecastUtils as FUtils from airqo_etl_utils.tests.conftest import ForecastFixtures -class ForecastTests(ForecastFixtures): - def test_preprocess_hourly_training_data(self, hourly_data, hourly_output): - assert isinstance( - ForecastUtils.preprocess_hourly_training_data(hourly_data), pd.DataFrame - ) - assert ( - ForecastUtils.preprocess_hourly_training_data(hourly_data).shape[0] - == hourly_output.shape[0] - ) - assert ForecastUtils.preprocess_hourly_training_data(hourly_data)[ - "pm2_5" - ].equals(hourly_output["pm2_5"]) +class TestsForecasts(ForecastFixtures): + def test_preprocess_data_typical_case(self, example_data): + result = FUtils.preprocess_data(example_data, "daily") + assert "pm2_5" in result.columns + + def test_preprocess_data_invalid_input(self, example_data): + df = example_data.drop(columns=["device_id"]) + with pytest.raises(ValueError): + FUtils.preprocess_data(df, "daily") - def test_preprocess_daily_training_data(self, daily_data, daily_output): - assert isinstance( - ForecastUtils.preprocess_daily_training_data(daily_data), pd.DataFrame - ) - assert ( - ForecastUtils.preprocess_daily_training_data(daily_data).shape[0] - == daily_output.shape[0] - ) - assert ForecastUtils.preprocess_daily_training_data(daily_data)["pm2_5"].equals( - daily_output["pm2_5"] - ) + def test_preprocess_data_invalid_timestamp(self, example_data): + # Invalid timestamp + df = example_data.copy() + df["timestamp"] = "invalid" + with pytest.raises(ValueError): + FUtils.preprocess_data(df, "daily") \ No newline at end of file From 15909c150468ca7844235a0f2e0748ba6ca2acf5 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Thu, 14 Sep 2023 22:00:02 +0300 Subject: [PATCH 19/43] add feature engineering tests --- src/airflow/airqo_etl_utils/ml_utils.py | 187 ++++++++++-------- src/airflow/airqo_etl_utils/tests/conftest.py | 24 ++- .../airqo_etl_utils/tests/ml_utils_tests.py | 83 +++++++- src/airflow/dags/ml_training_jobs.py | 49 ++++- 4 files changed, 239 insertions(+), 104 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py 
b/src/airflow/airqo_etl_utils/ml_utils.py index 095a3caf0d..f35116c014 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -100,6 +100,7 @@ def decode_categorical_features_before_save(df, frequency): @staticmethod def encode_categorical_training_features(df, freq): + df["timestamp"] = pd.to_datetime("timestamp") df1 = df.copy() columns = ["device_id", "site_id", "device_category"] mappings = [] @@ -155,102 +156,118 @@ def preprocess_data(data, data_frequency): data = data.dropna(subset=["pm2_5"]) return data + @staticmethod - def feature_eng_data(data, target_column, data_frequency, job_type): - def get_lag_features(df, target_col, freq): - df1 = df.copy() # use copy to prevent terminal warning - if freq == "daily": - shifts = [1, 2, 3, 7, 14] - for s in shifts: - df1[f"pm2_5_last_{s}_day"] = df1.groupby(["device_id"])[ - target_col - ].shift(s) - - shifts = [2, 3, 7, 14] - functions = ["mean", "std", "max", "min"] - for s in shifts: - for f in functions: - df1[f"pm2_5_{f}_{s}_day"] = ( - df1.groupby(["device_id"])[target_col] - .shift(1) - .rolling(s) - .agg(f) - ) - elif freq == "hourly": - shifts = [1, 2, 6, 12] - for s in shifts: - df1[f"pm2_5_last_{s}_hour"] = df1.groupby(["device_id"])[ - target_col - ].shift(s) - - shifts = [3, 6, 12, 24] - functions = ["mean", "std", "median", "skew"] - for s in shifts: - for f in functions: - df1[f"pm2_5_{f}_{s}_hour"] = ( - df1.groupby(["device_id"])[target_col] - .shift(1) - .rolling(s) - .agg(f) - ) - else: - raise ValueError("Invalid frequency") - - return df1 - - def get_time_and_cyclic_features(df, freq): - df1 = df.copy() - attributes = ["year", "month", "day", "dayofweek"] - max_vals = [2023, 12, 30, 7] - if freq == "hourly": - attributes.append("hour") - max_vals.append(23) - for a, m in zip(attributes, max_vals): - df1[a] = df1["timestamp"].dt.__getattribute__(a) - df1[a + "_sin"] = np.sin(2 * np.pi * df1[a] / m) - df1[a + "_cos"] = np.cos(2 * np.pi * df1[a] / m) - - df1["week"] = df1["timestamp"].dt.isocalendar().week - df1["week_sin"] = np.sin(2 * np.pi * df1["week"] / 52) - df1["week_cos"] = np.cos(2 * np.pi * df1["week"] / 52) - df1.drop(columns=attributes + ["week"], inplace=True) - return df1 - - def get_location_cord(df): - df["x_cord"] = np.cos(df["latitude"]) * np.cos(df["longitude"]) - df["y_cord"] = np.cos(df["latitude"]) * np.sin(df["longitude"]) - df["z_cord"] = np.sin(df["latitude"]) + def get_lag_and_roll_features(df, target_col, freq): + if df.empty: + raise ValueError("Empty dataframe provided") + + if target_col not in df.columns or "timestamp" not in df.columns or "device_id" not in df.columns: + raise ValueError("Required columns missing") + + df["timestamp"] = pd.to_datetime(df["timestamp"]) - return df + df1 = df.copy() # use copy to prevent terminal warning + if freq == "daily": + shifts = [1, 2, 3, 7, 14] + for s in shifts: + df1[f"pm2_5_last_{s}_day"] = df1.groupby(["device_id"])[ + target_col + ].shift(s) + shifts = [2, 3, 7, 14] + functions = ["mean", "std", "max", "min"] + for s in shifts: + for f in functions: + df1[f"pm2_5_{f}_{s}_day"] = ( + df1.groupby(["device_id"])[target_col] + .shift(1) + .rolling(s) + .agg(f) + ) + elif freq == "hourly": + shifts = [1, 2, 6, 12] + for s in shifts: + df1[f"pm2_5_last_{s}_hour"] = df1.groupby(["device_id"])[ + target_col + ].shift(s) + shifts = [3, 6, 12, 24] + functions = ["mean", "std", "median", "skew"] + for s in shifts: + for f in functions: + df1[f"pm2_5_{f}_{s}_hour"] = ( + df1.groupby(["device_id"])[target_col] + 
.shift(1) + .rolling(s) + .agg(f) + ) + else: + raise ValueError("Invalid frequency") + return df1 - df_tmp = data.copy() - df_tmp["timestamp"] = pd.to_datetime(df_tmp["timestamp"]) - df_tmp = get_lag_features(df_tmp, target_column, data_frequency) - df_tmp = get_time_and_cyclic_features(df_tmp, data_frequency) - df_tmp = get_location_cord(df_tmp) - if job_type == "train": - df_tmp = DecodingUtils.encode_categorical_training_features( - df_tmp, data_frequency - ) - elif job_type == "predict": - df_tmp = DecodingUtils.decode_categorical_features_pred( - df_tmp, data_frequency - ) - df_tmp.dropna( - subset=["device_id", "site_id", "device_category"], inplace=True - ) # only 1 row, not sure why + @staticmethod + def get_time_and_cyclic_features(df, freq): - df_tmp["device_id"] = df_tmp["device_id"].astype(int) - df_tmp["site_id"] = df_tmp["site_id"].astype(int) - df_tmp["device_category"] = df_tmp["device_category"].astype(int) + df["timestamp"] = pd.to_datetime(df["timestamp"]) + df1 = df.copy() + attributes = ["year", "month", "day", "dayofweek"] + max_vals = [2023, 12, 30, 7] + if freq == "hourly": + attributes.append("hour") + max_vals.append(23) + for a, m in zip(attributes, max_vals): + df1[a] = df1["timestamp"].dt.__getattribute__(a) + df1[a + "_sin"] = np.sin(2 * np.pi * df1[a] / m) + df1[a + "_cos"] = np.cos(2 * np.pi * df1[a] / m) + + df1["week"] = df1["timestamp"].dt.isocalendar().week + df1["week_sin"] = np.sin(2 * np.pi * df1["week"] / 52) + df1["week_cos"] = np.cos(2 * np.pi * df1["week"] / 52) + df1.drop(columns=attributes + ["week"], inplace=True) + return df1 + + @staticmethod + def get_location_features(df): + df["timestamp"] = pd.to_datetime(df) + df["x_cord"] = np.cos(df["latitude"]) * np.cos(df["longitude"]) + df["y_cord"] = np.cos(df["latitude"]) * np.sin(df["longitude"]) + df["z_cord"] = np.sin(df["latitude"]) + + return df - return df_tmp + # df_tmp = get_lag_features(df_tmp, target_column, data_frequency) + # df_tmp = get_time_and_cyclic_features(df_tmp, data_frequency) + # df_tmp = get_location_cord(df_tmp) + # if job_type == "train": + # df_tmp = DecodingUtils.encode_categorical_training_features( + # df_tmp, data_frequency + # ) + # elif job_type == "predict": + # df_tmp = DecodingUtils.decode_categorical_features_pred( + # df_tmp, data_frequency + # ) + # df_tmp.dropna( + # subset=["device_id", "site_id", "device_category"], inplace=True + # ) # only 1 row, not sure why + # + # df_tmp["device_id"] = df_tmp["device_id"].astype(int) + # df_tmp["site_id"] = df_tmp["site_id"].astype(int) + # df_tmp["device_category"] = df_tmp["device_category"].astype(int) + # + # return df_tmp @staticmethod def train_and_save_forecast_models(training_data, frequency): """ Perform the actual training for hourly data """ + training_data.dropna( + subset=["device_id", "site_id", "device_category"], inplace=True + ) + + training_data["device_id"] = training_data["device_id"].astype(int) + training_data["site_id"] = training_data["site_id"].astype(int) + training_data["device_category"] = training_data["device_category"].astype(int) + training_data["timestamp"] = pd.to_datetime(training_data["timestamp"]) features = [ c diff --git a/src/airflow/airqo_etl_utils/tests/conftest.py b/src/airflow/airqo_etl_utils/tests/conftest.py index e63ec4dab8..17433693da 100644 --- a/src/airflow/airqo_etl_utils/tests/conftest.py +++ b/src/airflow/airqo_etl_utils/tests/conftest.py @@ -13,7 +13,7 @@ def pytest_configure(config): class ForecastFixtures: @staticmethod @pytest.fixture(scope="session") - def 
example_data():
+    def preprocessing_sample_df():
         data = pd.DataFrame(
             {
                 "device_id": ["A", "B"],
@@ -26,6 +26,28 @@ def example_data():
         return data
 
+    @staticmethod
+    @pytest.fixture
+    def feat_eng_sample_df_daily():
+        data = {
+            "timestamp": pd.date_range(end = pd.Timestamp.now(), periods=365).tolist(),
+            "device_id": ["device1"] * 365,
+            "pm2_5": range(1, 366),
+        }
+        return pd.DataFrame(data)
+
+    @staticmethod
+    @pytest.fixture
+    def feat_eng_sample_df_hourly():
+        data = {
+            "timestamp": pd.date_range(end = pd.Timestamp.now(), periods=24*14, freq='H').tolist(),
+            "device_id": ["device1"] * 24*14,
+            "pm2_5": range(1, 24*14+1),
+        }
+        return pd.DataFrame(data)
+
+
+
 @pytest.fixture(scope="session")
 def mongo_fixture():
     from airqo_etl_utils.mongo_client import MongoClient
diff --git a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py
index 0e707711c4..7b56aa2a55 100644
--- a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py
+++ b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py
@@ -1,3 +1,4 @@
+import pandas as pd
 import pytest
 
 from airqo_etl_utils.ml_utils import ForecastUtils as FUtils
@@ -5,18 +6,84 @@
 
 
 class TestsForecasts(ForecastFixtures):
-    def test_preprocess_data_typical_case(self, example_data):
-        result = FUtils.preprocess_data(example_data, "daily")
+    # Preprocess data tests
+    def test_preprocess_data_typical_case(self, preprocessing_sample_df):
+        result = FUtils.preprocess_data(preprocessing_sample_df, "daily")
         assert "pm2_5" in result.columns
 
-    def test_preprocess_data_invalid_input(self, example_data):
-        df = example_data.drop(columns=["device_id"])
+    def test_preprocess_data_invalid_input(self, preprocessing_sample_df):
+        df = preprocessing_sample_df.drop(columns=["device_id"])
         with pytest.raises(ValueError):
             FUtils.preprocess_data(df, "daily")
 
-    def test_preprocess_data_invalid_timestamp(self, example_data):
-        # Invalid timestamp
-        df = example_data.copy()
+    def test_preprocess_data_invalid_timestamp(self, preprocessing_sample_df):
+        df = preprocessing_sample_df.copy()
         df["timestamp"] = "invalid"
         with pytest.raises(ValueError):
-            FUtils.preprocess_data(df, "daily")
\ No newline at end of file
+            FUtils.preprocess_data(df, "daily")
+
+    # Feature engineering tests
+    # get_lag_and_rolling_features tests
+
+    def test_empty_df(self):
+        with pytest.raises(ValueError, match="Empty dataframe provided"):
+            FUtils.get_lag_and_roll_features(pd.DataFrame(), "pm2_5", "daily")
+
+    def test_missing_columns(self, feat_eng_sample_df_daily):
+        del feat_eng_sample_df_daily[
+            "device_id"
+        ]  # Test for case where 'device_id' is missing
+        with pytest.raises(ValueError, match="Required columns missing"):
+            FUtils.get_lag_and_roll_features(feat_eng_sample_df_daily, "pm2_5", "daily")
+
+    def test_invalid_frequency(self, feat_eng_sample_df_daily):
+        with pytest.raises(ValueError, match="Invalid frequency"):
+            FUtils.get_lag_and_roll_features(
+                feat_eng_sample_df_daily, "pm2_5", "annually"
+            )
+
+    def test_hourly_freq(self, feat_eng_sample_df_hourly):
+        hourly_df = FUtils.get_lag_and_roll_features(
+            feat_eng_sample_df_hourly, "pm2_5", "hourly"
+        )
+        for s in [1, 2, 6, 12]:
+            assert f"pm2_5_last_{s}_hour" in hourly_df.columns
+        for s in [3, 6, 12, 24]:
+            for f in ["mean", "std", "median", "skew"]:
+                assert f"pm2_5_{f}_{s}_hour" in hourly_df.columns
+
+    def test_daily_freq(self, feat_eng_sample_df_daily):
+        daily_df = FUtils.get_lag_and_roll_features(
+            feat_eng_sample_df_daily, "pm2_5", "daily"
+        )
+        for s in [1, 2, 3, 7, 14]:
+            assert
f"pm2_5_last_{s}_day" in daily_df.columns + for s in [2, 3, 7, 14]: + for f in ["mean", "std", "max", "min"]: + assert f"pm2_5_{f}_{s}_day" in daily_df.columns + + def test_empty_df_for_time_and_cyclic_features(self): + with pytest.raises(ValueError, match="Empty dataframe provided"): + FUtils.get_time_and_cyclic_features(pd.DataFrame(), "daily") + + def test_missing_columns_for_time_and_cyclic_features(self, feat_eng_sample_df_daily): + with pytest.raises(ValueError, match="Required columns missing"): + FUtils.get_time_and_cyclic_features(feat_eng_sample_df_daily, "daily") + + def test_invalid_frequency_for_time_and_cyclic_features(self, feat_eng_sample_df_daily): + with pytest.raises(ValueError, match="Invalid frequency"): + FUtils.get_time_and_cyclic_features(feat_eng_sample_df_daily, "annually") + +# For 'daily' frequency + def test_daily_freq_for_time_and_cyclic_features(self, feat_eng_sample_df_daily): + daily_df = FUtils.get_time_and_cyclic_features(feat_eng_sample_df_daily, "daily") + for a in ["year", "month", "day", "dayofweek", "week"]: + for t in ["_sin", "_cos"]: + assert f"{a}{t}" in daily_df.columns + +# For 'hourly' frequency + def test_hourly_freq_for_time_and_cyclic_features(self, feat_eng_sample_df_hourly): + hourly_df = FUtils.get_time_and_cyclic_features(feat_eng_sample_df_hourly, "hourly") + for a in ["year", "month", "day", "dayofweek", "hour", "week"]: + for t in ["_sin", "_cos"]: + assert f"{a}{t}" in hourly_df.columns diff --git a/src/airflow/dags/ml_training_jobs.py b/src/airflow/dags/ml_training_jobs.py index a517d6a3a7..e5822d12cc 100644 --- a/src/airflow/dags/ml_training_jobs.py +++ b/src/airflow/dags/ml_training_jobs.py @@ -1,12 +1,13 @@ +from datetime import datetime + from airflow.decorators import dag, task +from dateutil.relativedelta import relativedelta from airqo_etl_utils.airflow_custom_utils import AirflowUtils +from airqo_etl_utils.bigquery_api import BigQueryApi from airqo_etl_utils.config import configuration -from airqo_etl_utils.ml_utils import ForecastUtils from airqo_etl_utils.date import date_to_str -from dateutil.relativedelta import relativedelta -from airqo_etl_utils.bigquery_api import BigQueryApi -from datetime import datetime +from airqo_etl_utils.ml_utils import ForecastUtils, DecodingUtils @dag( @@ -32,10 +33,21 @@ def preprocess_training_data_for_hourly_forecast_model(data): return ForecastUtils.preprocess_data(data, "hourly") @task() - def feat_engineer_training_data_for_hourly_forecast_model(data): - return ForecastUtils.feature_eng_data(data, "pm2_5", "hourly", "train") + def get_hourly_lag_and_rolling_features(data): + return ForecastUtils.get_lag_and_roll_features(data, 'pm2_5', 'hourly') + + @task() + def get_hourly_time_and_cyclic_features(data): + return ForecastUtils.get_time_and_cyclic_features(data, 'hourly') @task() + def get_location_features(data): + return ForecastUtils.get_location_features(data) + + @task() + def encode_categorical_features(data): + return DecodingUtils.encode_categorical_training_features(data, 'daily') + @task() def train_and_save_hourly_forecast_model(train_data): return ForecastUtils.train_and_save_forecast_models( train_data, frequency="hourly" @@ -59,21 +71,38 @@ def preprocess_training_data_for_daily_forecast_model(data): return ForecastUtils.preprocess_data(data, "daily") @task() - def feature_engineer_data_for_daily_forecast_model(data): - return ForecastUtils.feature_eng_data(data, "pm2_5", "daily", "train") + def get_daily_lag_and_rolling_features(data): + return 
ForecastUtils.get_lag_and_roll_features(data, "pm2_5", "daily") + + @task() + def get_daily_time_and_cylic_features(data): + return ForecastUtils.get_time_and_cyclic_features(data, 'daily') + + @task() + def get_location_features(data): + return ForecastUtils.get_location_features(data) @task() + def encode_categorical_features(data): + return DecodingUtils.encode_categorical_training_features(data, 'daily') + @task() def train_and_save_daily_model(train_data): return ForecastUtils.train_and_save_forecast_models(train_data, "daily") hourly_data = fetch_training_data_for_hourly_forecast_model() hourly_data = preprocess_training_data_for_hourly_forecast_model(hourly_data) - hourly_data = feat_engineer_training_data_for_hourly_forecast_model(hourly_data) + hourly_data = get_hourly_lag_and_rolling_features(hourly_data) + hourly_data = get_hourly_time_and_cyclic_features(hourly_data) + hourly_data = get_location_features(hourly_data) + hourly_data = encode_categorical_features(hourly_data) train_and_save_hourly_forecast_model(hourly_data) daily_data = fetch_training_data_for_daily_forecast_model() daily_data = preprocess_training_data_for_daily_forecast_model(daily_data) - daily_data = feature_engineer_data_for_daily_forecast_model(daily_data) + daily_data = get_daily_lag_and_rolling_features(daily_data) + daily_data = get_daily_time_and_cylic_features(daily_data) + daily_data = get_location_features(daily_data) + daily_data = encode_categorical_features(daily_data) train_and_save_daily_model(daily_data) From f18da9b5506bad7d4fd3b35d03f8240064032de4 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Sat, 16 Sep 2023 13:15:05 +0300 Subject: [PATCH 20/43] training job cleanup --- src/airflow/airqo_etl_utils/ml_utils.py | 31 ++++++++++--- src/airflow/airqo_etl_utils/tests/conftest.py | 15 ++++++- .../airqo_etl_utils/tests/ml_utils_tests.py | 43 +++++++++++++++++++ src/airflow/dev-requirements.txt | 2 +- 4 files changed, 83 insertions(+), 8 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index f35116c014..3f541662f7 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -22,6 +22,7 @@ class GCSUtils: + """ Utility class for saving and retrieving models from GCS""" # TODO: In future, save and retrieve models from mlflow instead of GCS @staticmethod def get_trained_model_from_gcs(project_name, bucket_name, source_blob_name): @@ -66,6 +67,7 @@ def get_mapping_from_gcs(project_name, bucket_name, source_blob_name): class DecodingUtils: + """ Utility class for encoding and decoding categorical features""" @staticmethod def decode_categorical_features_pred(df, frequency): columns = ["device_id", "site_id", "device_category"] @@ -100,7 +102,7 @@ def decode_categorical_features_before_save(df, frequency): @staticmethod def encode_categorical_training_features(df, freq): - df["timestamp"] = pd.to_datetime("timestamp") + df["timestamp"] = pd.to_datetime(df["timestamp"]) df1 = df.copy() columns = ["device_id", "site_id", "device_category"] mappings = [] @@ -169,12 +171,12 @@ def get_lag_and_roll_features(df, target_col, freq): df1 = df.copy() # use copy to prevent terminal warning if freq == "daily": - shifts = [1, 2, 3, 7, 14] + shifts = [1, 2, 3, 7] for s in shifts: df1[f"pm2_5_last_{s}_day"] = df1.groupby(["device_id"])[ target_col ].shift(s) - shifts = [2, 3, 7, 14] + shifts = [2, 3, 7] functions = ["mean", "std", "max", "min"] for s in shifts: for f in 
functions: @@ -206,7 +208,16 @@ def get_lag_and_roll_features(df, target_col, freq): @staticmethod def get_time_and_cyclic_features(df, freq): + if df.empty: + raise ValueError("Empty dataframe provided") + + if "timestamp" not in df.columns: + raise ValueError("Required columns missing") + + df["timestamp"] = pd.to_datetime(df["timestamp"]) + if freq not in ["daily", "hourly"]: + raise ValueError("Invalid frequency") df["timestamp"] = pd.to_datetime(df["timestamp"]) df1 = df.copy() attributes = ["year", "month", "day", "dayofweek"] @@ -227,7 +238,15 @@ def get_time_and_cyclic_features(df, freq): @staticmethod def get_location_features(df): - df["timestamp"] = pd.to_datetime(df) + if df.empty: + raise ValueError("Empty dataframe provided") + + for column_name in ["timestamp", "latitude", "longitude"]: + if column_name not in df.columns: + raise ValueError(f"{column_name} column is missing") + + df["timestamp"] = pd.to_datetime(df["timestamp"]) + df["x_cord"] = np.cos(df["latitude"]) * np.cos(df["longitude"]) df["y_cord"] = np.cos(df["latitude"]) * np.sin(df["longitude"]) df["z_cord"] = np.sin(df["latitude"]) @@ -480,11 +499,11 @@ def get_forecasts( # daily frequency if frequency == "daily": df_tmp.tail(1)["timestamp"] += timedelta(days=1) - shifts1 = [1, 2, 3, 7, 14] + shifts1 = [1, 2, 3, 7] for s in shifts1: df_tmp[f"pm2_5_last_{s}_day"] = df_tmp.shift(s, axis=0)["pm2_5"] # rolling features - shifts2 = [2, 3, 7, 14] + shifts2 = [2, 3, 7] functions = ["mean", "std", "max", "min"] for s in shifts2: for f in functions: diff --git a/src/airflow/airqo_etl_utils/tests/conftest.py b/src/airflow/airqo_etl_utils/tests/conftest.py index 17433693da..4f24ccd45e 100644 --- a/src/airflow/airqo_etl_utils/tests/conftest.py +++ b/src/airflow/airqo_etl_utils/tests/conftest.py @@ -1,5 +1,6 @@ from datetime import datetime +import numpy as np import pandas as pd import pytest @@ -10,6 +11,8 @@ def pytest_configure(config): ) + + class ForecastFixtures: @staticmethod @pytest.fixture(scope="session") @@ -46,7 +49,17 @@ def feat_eng_sample_df_hourly(): } return pd.DataFrame(data) - + @staticmethod + @pytest.fixture + def sample_dataframe_for_location_features(): + data = { + "timestamp": pd.date_range(end=pd.Timestamp.now(), periods=100) + .tolist(), + "device_id": ["device1"] * 100, + "latitude": np.random.uniform(-90, 90, 100), + "longitude": np.random.uniform(-180, 180, 100), + } + return pd.DataFrame(data) @pytest.fixture(scope="session") def mongo_fixture(): diff --git a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py index 7b56aa2a55..ec7128d9e6 100644 --- a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py +++ b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py @@ -87,3 +87,46 @@ def test_hourly_freq_for_time_and_cyclic_features(self, feat_eng_sample_df_hourl for a in ["year", "month", "day", "dayofweek", "hour", "week"]: for t in ["_sin", "_cos"]: assert f"{a}{t}" in hourly_df.columns + + def test_empty_df_for_location_features(self, sample_dataframe_for_location_features): + with pytest.raises(ValueError, match="Empty dataframe provided"): + FUtils.get_location_features(pd.DataFrame()) + + + def test_missing_timestamp_for_location_features( + self, + sample_dataframe_for_location_features, + ): + del sample_dataframe_for_location_features[ + "timestamp" + ] + with pytest.raises(ValueError, match="timestamp column is missing"): + FUtils.get_location_features(sample_dataframe_for_location_features) + + + # For missing 'latitude' column + def 
test_missing_latitude_for_location_features( + self, sample_dataframe_for_location_features + ): + del sample_dataframe_for_location_features[ + "latitude" + ] # Test for missing 'latitude' + with pytest.raises(ValueError, match="latitude column is missing"): + FUtils.get_location_features(sample_dataframe_for_location_features) + + + def test_missing_longitude_for_location_features( + self, sample_dataframe_for_location_features + ): + del sample_dataframe_for_location_features[ + "longitude" + ] # Test for missing 'longitude' + with pytest.raises(ValueError, match="longitude column is missing"): + FUtils.get_location_features(sample_dataframe_for_location_features) + + + # Test the normal procedure + def test_get_location_features(self, sample_dataframe_for_location_features): + df = FUtils.get_location_features(sample_dataframe_for_location_features) + for cord in ["x_cord", "y_cord", "z_cord"]: + assert cord in df.columns \ No newline at end of file diff --git a/src/airflow/dev-requirements.txt b/src/airflow/dev-requirements.txt index 81c23b0562..d37188bf8d 100644 --- a/src/airflow/dev-requirements.txt +++ b/src/airflow/dev-requirements.txt @@ -3,6 +3,7 @@ apache-airflow-providers-slack confluent-avro google-cloud-bigquery google-cloud-storage +optuna pyarrow sentry-sdk pandas @@ -17,5 +18,4 @@ db_dtypes mlflow lightgbm gcsfs -pymongo pytest \ No newline at end of file From 8a6e8e77f42860ee7446a579aa6cedcf7ad98c64 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:07:49 +0300 Subject: [PATCH 21/43] refactor forecast job DAG --- src/airflow/airqo_etl_utils/ml_utils.py | 1 - src/airflow/dags/ml_prediction_jobs.py | 61 ++++++++++++++++++++----- 2 files changed, 50 insertions(+), 12 deletions(-) diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 3f541662f7..d026a418c8 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -609,7 +609,6 @@ def get_forecasts( ) forecasts = pd.concat([forecasts, device_forecasts], ignore_index=True) - print(device) forecasts["pm2_5"] = forecasts["pm2_5"].astype(float) # forecasts["margin_of_error"] = forecasts["margin_of_error"].astype(float) diff --git a/src/airflow/dags/ml_prediction_jobs.py b/src/airflow/dags/ml_prediction_jobs.py index 04972fccd6..20cd5593ca 100644 --- a/src/airflow/dags/ml_prediction_jobs.py +++ b/src/airflow/dags/ml_prediction_jobs.py @@ -34,8 +34,18 @@ def preprocess_historical_data_hourly_forecast(data): return ForecastUtils.preprocess_data(data, "hourly") @task - def feature_eng_hourly_historical_data(data): - return ForecastUtils.feature_eng_data(data, "pm2_5", "hourly", "predict") + def generate_lag_and_rolling_features_hourly_forecast(data): + return ForecastUtils.get_lag_and_roll_features(data, "pm2_5", "hourly") + + + @task() + def get_time_and_cyclic_features_hourly_forecast(data): + return ForecastUtils.get_time_and_cyclic_features(data, "hourly") + + + @task() + def get_location_features_hourly_forecast(data): + return ForecastUtils.get_location_features(data) @task() def make_hourly_forecasts(data): @@ -70,12 +80,22 @@ def preprocess_historical_data_daily_forecast(data): return ForecastUtils.preprocess_data(data, "daily") @task() - def feature_engineer_daily_historical_data(data): - return ForecastUtils.feature_eng_data(data, "pm2_5", "daily", "predict") + def generate_lag_and_rolling_features_daily_forecast(data): + return 
ForecastUtils.get_lag_and_roll_features(data, "pm2_5", "daily") + + @task() + def get_time_and_cyclic_features_daily_forecast(data): + return ForecastUtils.get_time_and_cyclic_features(data, "daily") + + @task() + def get_location_features_daily_forecast(data): + return ForecastUtils.get_location_features(data) @task() def make_daily_forecasts(data): - return ForecastUtils.generate_forecasts(data, project_id, bucket, "daily") + return ForecastUtils.generate_forecasts( + data=data, project_name=project_id, bucket_name=bucket, frequency="daily" + ) @task() def save_daily_forecasts_to_bigquery(data): @@ -87,17 +107,36 @@ def save_daily_forecasts_to_bigquery(data): def save_daily_forecasts_to_mongo(data): ForecastUtils.save_forecasts_to_mongo(data, "daily") + + # Hourly forecast pipeline hourly_data = get_historical_data_for_hourly_forecasts() - preprocessed_hourly_data = preprocess_historical_data_hourly_forecast(hourly_data) - feat_data = feature_eng_hourly_historical_data(preprocessed_hourly_data) - hourly_forecasts = make_hourly_forecasts(feat_data) + hourly_preprocessed_data = preprocess_historical_data_hourly_forecast(hourly_data) + hourly_lag_and_roll_features = generate_lag_and_rolling_features_hourly_forecast( + hourly_preprocessed_data + ) + hourly_time_and_cyclic_features = get_time_and_cyclic_features_hourly_forecast( + hourly_lag_and_roll_features + ) + hourly_location_features = get_location_features_hourly_forecast( + hourly_time_and_cyclic_features + ) + hourly_forecasts = make_hourly_forecasts(hourly_location_features) save_hourly_forecasts_to_bigquery(hourly_forecasts) save_hourly_forecasts_to_mongo(hourly_forecasts) + # Daily forecast pipeline daily_data = get_historical_data_for_daily_forecasts() - preprocessed_daily_data = preprocess_historical_data_daily_forecast(daily_data) - feat_data = feature_engineer_daily_historical_data(preprocessed_daily_data) - daily_forecasts = make_daily_forecasts(feat_data) + daily_preprocessed_data = preprocess_historical_data_daily_forecast(daily_data) + daily_lag_and_roll_features = generate_lag_and_rolling_features_daily_forecast( + daily_preprocessed_data + ) + daily_time_and_cyclic_features = get_time_and_cyclic_features_daily_forecast( + daily_lag_and_roll_features + ) + daily_location_features = get_location_features_daily_forecast( + daily_time_and_cyclic_features + ) + daily_forecasts = make_daily_forecasts(daily_location_features) save_daily_forecasts_to_bigquery(daily_forecasts) save_daily_forecasts_to_mongo(daily_forecasts) From 01a6674aabd40da4342d10d096bc0daed0d12465 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:20:18 +0300 Subject: [PATCH 22/43] Update AirQo exceedance production image tag to prod-74273167-1695028772 --- k8s/exceedance/values-prod-airqo.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/exceedance/values-prod-airqo.yaml b/k8s/exceedance/values-prod-airqo.yaml index 835a06a025..8a9e92895e 100644 --- a/k8s/exceedance/values-prod-airqo.yaml +++ b/k8s/exceedance/values-prod-airqo.yaml @@ -4,6 +4,6 @@ app: configmap: env-exceedance-production image: repository: eu.gcr.io/airqo-250220/airqo-exceedance-job - tag: prod-d4165e1e-1695022368 + tag: prod-74273167-1695028772 nameOverride: '' fullnameOverride: '' From 4233c133a606fa3ba2f50c9ef72c7564435ed01a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:20:27 +0300 Subject: 
[PATCH 23/43] Update KCCA exceedance production image tag to prod-74273167-1695028772 --- k8s/exceedance/values-prod-kcca.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/exceedance/values-prod-kcca.yaml b/k8s/exceedance/values-prod-kcca.yaml index d8306e316e..f1b68b74fe 100644 --- a/k8s/exceedance/values-prod-kcca.yaml +++ b/k8s/exceedance/values-prod-kcca.yaml @@ -4,6 +4,6 @@ app: configmap: env-exceedance-production image: repository: eu.gcr.io/airqo-250220/kcca-exceedance-job - tag: prod-d4165e1e-1695022368 + tag: prod-74273167-1695028772 nameOverride: '' fullnameOverride: '' From 2474e9b2cea09c993bd864b9a256ceea55952f5f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:21:04 +0300 Subject: [PATCH 24/43] Update incentives production image tag to prod-74273167-1695028772 --- k8s/incentives/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/incentives/values-prod.yaml b/k8s/incentives/values-prod.yaml index 91b285ee99..2526ab40a1 100644 --- a/k8s/incentives/values-prod.yaml +++ b/k8s/incentives/values-prod.yaml @@ -6,7 +6,7 @@ app: replicaCount: 3 image: repository: eu.gcr.io/airqo-250220/airqo-incentives-api - tag: prod-d4165e1e-1695022368 + tag: prod-74273167-1695028772 nameOverride: '' fullnameOverride: '' podAnnotations: {} From f669ae119dcba6c4e55d67454673cdbd344e4904 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:21:09 +0300 Subject: [PATCH 25/43] Update auth service staging image tag to stage-5513f226-1695028756 --- k8s/auth-service/values-stage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/auth-service/values-stage.yaml b/k8s/auth-service/values-stage.yaml index 8a1b64d1b8..341ee6b68a 100644 --- a/k8s/auth-service/values-stage.yaml +++ b/k8s/auth-service/values-stage.yaml @@ -6,7 +6,7 @@ app: replicaCount: 2 image: repository: eu.gcr.io/airqo-250220/airqo-stage-auth-api - tag: stage-b17fbb54-1694524327 + tag: stage-5513f226-1695028756 nameOverride: '' fullnameOverride: '' podAnnotations: {} From 0e8d9a7edfcb7b0cf93a41709a0a70b1c1e47b92 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:21:57 +0300 Subject: [PATCH 26/43] Update device registry production image tag to prod-74273167-1695028772 --- k8s/device-registry/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/device-registry/values-prod.yaml b/k8s/device-registry/values-prod.yaml index ba45a082f1..11dfc9e0c2 100644 --- a/k8s/device-registry/values-prod.yaml +++ b/k8s/device-registry/values-prod.yaml @@ -6,7 +6,7 @@ app: replicaCount: 3 image: repository: eu.gcr.io/airqo-250220/airqo-device-registry-api - tag: prod-80ea615f-1694585638 + tag: prod-74273167-1695028772 nameOverride: '' fullnameOverride: '' podAnnotations: {} From 6d1b58d4c2f1717d55402308ab8f86810ed27f28 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:22:09 +0300 Subject: [PATCH 27/43] Update auth service production image tag to prod-74273167-1695028772 --- k8s/auth-service/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/auth-service/values-prod.yaml b/k8s/auth-service/values-prod.yaml index 620252ed57..b7b8ea2057 100644 --- a/k8s/auth-service/values-prod.yaml 
+++ b/k8s/auth-service/values-prod.yaml @@ -6,7 +6,7 @@ app: replicaCount: 3 image: repository: eu.gcr.io/airqo-250220/airqo-auth-api - tag: prod-d4165e1e-1695022368 + tag: prod-74273167-1695028772 nameOverride: '' fullnameOverride: '' podAnnotations: {} From 2a75b4e27a64d954eea242041f24f9ad55f77fee Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:22:48 +0300 Subject: [PATCH 28/43] Update analytics production image tag to prod-74273167-1695028772 --- k8s/analytics/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/analytics/values-prod.yaml b/k8s/analytics/values-prod.yaml index 56b7b6b3e5..aa95a02534 100644 --- a/k8s/analytics/values-prod.yaml +++ b/k8s/analytics/values-prod.yaml @@ -8,7 +8,7 @@ images: celeryWorker: eu.gcr.io/airqo-250220/airqo-analytics-celery-worker reportJob: eu.gcr.io/airqo-250220/airqo-analytics-report-job devicesSummaryJob: eu.gcr.io/airqo-250220/airqo-analytics-devices-summary-job - tag: prod-d4165e1e-1695022368 + tag: prod-74273167-1695028772 api: name: airqo-analytics-api label: analytics-api From 280ecfb1464eb3b5d40413f73b1fae5abc9657c5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:23:04 +0300 Subject: [PATCH 29/43] Update airflow prod image tag to prod-74273167-1695028772 --- k8s/airflow/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/airflow/values-prod.yaml b/k8s/airflow/values-prod.yaml index f01e6e714a..1cee1ed482 100644 --- a/k8s/airflow/values-prod.yaml +++ b/k8s/airflow/values-prod.yaml @@ -9,7 +9,7 @@ images: repositories: initContainer: eu.gcr.io/airqo-250220/airqo-apache-airflow-xcom containers: eu.gcr.io/airqo-250220/airqo-apache-airflow - tag: prod-d4165e1e-1695022368 + tag: prod-74273167-1695028772 nameOverride: '' fullnameOverride: '' podAnnotations: {} From 7cc6e2e7f188eeae2a67a96e099d64a04a026ef9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:24:48 +0300 Subject: [PATCH 30/43] Update predict production image tag to prod-74273167-1695028772 --- k8s/predict/values-prod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/predict/values-prod.yaml b/k8s/predict/values-prod.yaml index 3b64007dd0..cfe7b4056c 100644 --- a/k8s/predict/values-prod.yaml +++ b/k8s/predict/values-prod.yaml @@ -7,7 +7,7 @@ images: predictJob: eu.gcr.io/airqo-250220/airqo-predict-job trainJob: eu.gcr.io/airqo-250220/airqo-train-job predictPlaces: eu.gcr.io/airqo-250220/airqo-predict-places-air-quality - tag: prod-d4165e1e-1695022368 + tag: prod-74273167-1695028772 api: name: airqo-prediction-api label: prediction-api From 0f22ce91b0b3186539ec3ff031f73452728ce327 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Mon, 18 Sep 2023 12:34:10 +0300 Subject: [PATCH 31/43] Run black formatter --- src/airflow/airqo_etl_utils/bigquery_api.py | 2 - src/airflow/airqo_etl_utils/ml_utils.py | 39 ++++++++++------- .../tests/big_query_api_tests.py | 8 ++-- src/airflow/airqo_etl_utils/tests/conftest.py | 17 ++++---- .../airqo_etl_utils/tests/ml_utils_tests.py | 42 ++++++++++--------- src/airflow/dags/data_warehouse.py | 5 ++- src/airflow/dags/ml_prediction_jobs.py | 7 +--- src/airflow/dags/ml_training_jobs.py | 12 +++--- 8 files changed, 71 insertions(+), 61 deletions(-) diff --git 
a/src/airflow/airqo_etl_utils/bigquery_api.py b/src/airflow/airqo_etl_utils/bigquery_api.py index 65317a0e77..2e66b9fc10 100644 --- a/src/airflow/airqo_etl_utils/bigquery_api.py +++ b/src/airflow/airqo_etl_utils/bigquery_api.py @@ -647,8 +647,6 @@ def fetch_data( except Exception as e: print("Error fetching data from bigquery") - - @staticmethod def save_forecasts_to_bigquery(df, table): """saves the dataframes to the bigquery tables""" diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index d026a418c8..811e267515 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -22,7 +22,8 @@ class GCSUtils: - """ Utility class for saving and retrieving models from GCS""" + """Utility class for saving and retrieving models from GCS""" + # TODO: In future, save and retrieve models from mlflow instead of GCS @staticmethod def get_trained_model_from_gcs(project_name, bucket_name, source_blob_name): @@ -67,7 +68,8 @@ def get_mapping_from_gcs(project_name, bucket_name, source_blob_name): class DecodingUtils: - """ Utility class for encoding and decoding categorical features""" + """Utility class for encoding and decoding categorical features""" + @staticmethod def decode_categorical_features_pred(df, frequency): columns = ["device_id", "site_id", "device_category"] @@ -128,7 +130,13 @@ def encode_categorical_training_features(df, freq): class ForecastUtils: @staticmethod def preprocess_data(data, data_frequency): - required_columns = {"device_id", "site_id", "device_category", "pm2_5", "timestamp"} + required_columns = { + "device_id", + "site_id", + "device_category", + "pm2_5", + "timestamp", + } if not required_columns.issubset(data.columns): missing_columns = required_columns.difference(data.columns) raise ValueError( @@ -158,17 +166,20 @@ def preprocess_data(data, data_frequency): data = data.dropna(subset=["pm2_5"]) return data - @staticmethod def get_lag_and_roll_features(df, target_col, freq): if df.empty: raise ValueError("Empty dataframe provided") - if target_col not in df.columns or "timestamp" not in df.columns or "device_id" not in df.columns: + if ( + target_col not in df.columns + or "timestamp" not in df.columns + or "device_id" not in df.columns + ): raise ValueError("Required columns missing") df["timestamp"] = pd.to_datetime(df["timestamp"]) - + df1 = df.copy() # use copy to prevent terminal warning if freq == "daily": shifts = [1, 2, 3, 7] @@ -229,7 +240,7 @@ def get_time_and_cyclic_features(df, freq): df1[a] = df1["timestamp"].dt.__getattribute__(a) df1[a + "_sin"] = np.sin(2 * np.pi * df1[a] / m) df1[a + "_cos"] = np.cos(2 * np.pi * df1[a] / m) - + df1["week"] = df1["timestamp"].dt.isocalendar().week df1["week_sin"] = np.sin(2 * np.pi * df1["week"] / 52) df1["week_cos"] = np.cos(2 * np.pi * df1["week"] / 52) @@ -240,17 +251,17 @@ def get_time_and_cyclic_features(df, freq): def get_location_features(df): if df.empty: raise ValueError("Empty dataframe provided") - + for column_name in ["timestamp", "latitude", "longitude"]: if column_name not in df.columns: raise ValueError(f"{column_name} column is missing") - + df["timestamp"] = pd.to_datetime(df["timestamp"]) - + df["x_cord"] = np.cos(df["latitude"]) * np.cos(df["longitude"]) df["y_cord"] = np.cos(df["latitude"]) * np.sin(df["longitude"]) df["z_cord"] = np.sin(df["latitude"]) - + return df # df_tmp = get_lag_features(df_tmp, target_column, data_frequency) @@ -280,8 +291,8 @@ def train_and_save_forecast_models(training_data, frequency): 
Perform the actual training for hourly data """ training_data.dropna( - subset=["device_id", "site_id", "device_category"], inplace=True - ) + subset=["device_id", "site_id", "device_category"], inplace=True + ) training_data["device_id"] = training_data["device_id"].astype(int) training_data["site_id"] = training_data["site_id"].astype(int) @@ -583,7 +594,7 @@ def get_forecasts( # + df_tmp.loc[df_tmp.index[-1], "margin_of_error"] # ) - return df_tmp.iloc[-int(horizon):, :] + return df_tmp.iloc[-int(horizon) :, :] forecasts = pd.DataFrame() forecast_model = GCSUtils.get_trained_model_from_gcs( diff --git a/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py b/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py index da4a20e4f4..2be61e9415 100644 --- a/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py +++ b/src/airflow/airqo_etl_utils/tests/big_query_api_tests.py @@ -54,9 +54,7 @@ def fake_query(query, job_config): fake_job = mock.Mock() if "2023-01-01" in query: - fake_job.result.return_value.to_dataframe.return_value = ( - sample_df - ) + fake_job.result.return_value.to_dataframe.return_value = sample_df elif "2023-01-02" in query: fake_job.result.return_value.to_dataframe.return_value = ( fake_data_empty_result @@ -72,6 +70,7 @@ def fake_query(query, job_config): return fake_client + @pytest.mark.parametrize( "start_date_time, expected_df", [ @@ -98,7 +97,6 @@ def fake_query(query, job_config): ], ) def test_fetch_data_correct_se(mock_bigquery_client2, start_date_time, expected_df): - """Tests the fetch_data method for scenarios when correct data is retrieved.""" bq_api = BigQueryApi() @@ -118,6 +116,7 @@ def test_fetch_data_invalid_date(mock_bigquery_client2, start_date_time): with pytest.raises(ValueError): bq_api.fetch_data(start_date_time) + @pytest.mark.parametrize("start_date_time", ["2023-01-03"]) def test_fetch_data_bigquery_error(mock_bigquery_client2, start_date_time): """Tests the fetch_data method for the scenario where a bigquery.GoogleAPIError is raised.""" @@ -130,7 +129,6 @@ def test_fetch_data_bigquery_error(mock_bigquery_client2, start_date_time): bq_api.fetch_data(start_date_time) - def test_fetch_raw_readings_empty(mock_bigquery_client): api = BigQueryApi() api.client = mock_bigquery_client diff --git a/src/airflow/airqo_etl_utils/tests/conftest.py b/src/airflow/airqo_etl_utils/tests/conftest.py index 4f24ccd45e..cdbc784dc7 100644 --- a/src/airflow/airqo_etl_utils/tests/conftest.py +++ b/src/airflow/airqo_etl_utils/tests/conftest.py @@ -11,8 +11,6 @@ def pytest_configure(config): ) - - class ForecastFixtures: @staticmethod @pytest.fixture(scope="session") @@ -28,12 +26,11 @@ def preprocessing_sample_df(): ) return data - @staticmethod @pytest.fixture def feat_eng_sample_df_daily(): data = { - "timestamp": pd.date_range(end = pd.Timestamp.now(), periods=365).tolist(), + "timestamp": pd.date_range(end=pd.Timestamp.now(), periods=365).tolist(), "device_id": ["device1"] * 365, "pm2_5": range(1, 366), } @@ -43,9 +40,11 @@ def feat_eng_sample_df_daily(): @pytest.fixture def feat_eng_sample_df_hourly(): data = { - "timestamp": pd.date_range(end = pd.Timestamp.now(), periods=24*14, freq='H').tolist(), - "device_id": ["device1"] * 24*14, - "pm2_5": range(1, 24*14+1), + "timestamp": pd.date_range( + end=pd.Timestamp.now(), periods=24 * 14, freq="H" + ).tolist(), + "device_id": ["device1"] * 24 * 14, + "pm2_5": range(1, 24 * 14 + 1), } return pd.DataFrame(data) @@ -53,14 +52,14 @@ def feat_eng_sample_df_hourly(): @pytest.fixture def 
sample_dataframe_for_location_features(): data = { - "timestamp": pd.date_range(end=pd.Timestamp.now(), periods=100) - .tolist(), + "timestamp": pd.date_range(end=pd.Timestamp.now(), periods=100).tolist(), "device_id": ["device1"] * 100, "latitude": np.random.uniform(-90, 90, 100), "longitude": np.random.uniform(-180, 180, 100), } return pd.DataFrame(data) + @pytest.fixture(scope="session") def mongo_fixture(): from airqo_etl_utils.mongo_client import MongoClient diff --git a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py index ec7128d9e6..f07f58c908 100644 --- a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py +++ b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py @@ -66,44 +66,50 @@ def test_empty_df_for_time_and_cyclic_features(self): with pytest.raises(ValueError, match="Empty dataframe provided"): FUtils.get_time_and_cyclic_features(pd.DataFrame(), "daily") - def test_missing_columns_for_time_and_cyclic_features(self, feat_eng_sample_df_daily): + def test_missing_columns_for_time_and_cyclic_features( + self, feat_eng_sample_df_daily + ): with pytest.raises(ValueError, match="Required columns missing"): FUtils.get_time_and_cyclic_features(feat_eng_sample_df_daily, "daily") - def test_invalid_frequency_for_time_and_cyclic_features(self, feat_eng_sample_df_daily): + def test_invalid_frequency_for_time_and_cyclic_features( + self, feat_eng_sample_df_daily + ): with pytest.raises(ValueError, match="Invalid frequency"): FUtils.get_time_and_cyclic_features(feat_eng_sample_df_daily, "annually") -# For 'daily' frequency + # For 'daily' frequency def test_daily_freq_for_time_and_cyclic_features(self, feat_eng_sample_df_daily): - daily_df = FUtils.get_time_and_cyclic_features(feat_eng_sample_df_daily, "daily") + daily_df = FUtils.get_time_and_cyclic_features( + feat_eng_sample_df_daily, "daily" + ) for a in ["year", "month", "day", "dayofweek", "week"]: for t in ["_sin", "_cos"]: assert f"{a}{t}" in daily_df.columns -# For 'hourly' frequency + # For 'hourly' frequency def test_hourly_freq_for_time_and_cyclic_features(self, feat_eng_sample_df_hourly): - hourly_df = FUtils.get_time_and_cyclic_features(feat_eng_sample_df_hourly, "hourly") + hourly_df = FUtils.get_time_and_cyclic_features( + feat_eng_sample_df_hourly, "hourly" + ) for a in ["year", "month", "day", "dayofweek", "hour", "week"]: for t in ["_sin", "_cos"]: assert f"{a}{t}" in hourly_df.columns - def test_empty_df_for_location_features(self, sample_dataframe_for_location_features): + def test_empty_df_for_location_features( + self, sample_dataframe_for_location_features + ): with pytest.raises(ValueError, match="Empty dataframe provided"): FUtils.get_location_features(pd.DataFrame()) - - + def test_missing_timestamp_for_location_features( self, sample_dataframe_for_location_features, ): - del sample_dataframe_for_location_features[ - "timestamp" - ] + del sample_dataframe_for_location_features["timestamp"] with pytest.raises(ValueError, match="timestamp column is missing"): FUtils.get_location_features(sample_dataframe_for_location_features) - - + # For missing 'latitude' column def test_missing_latitude_for_location_features( self, sample_dataframe_for_location_features @@ -113,8 +119,7 @@ def test_missing_latitude_for_location_features( ] # Test for missing 'latitude' with pytest.raises(ValueError, match="latitude column is missing"): FUtils.get_location_features(sample_dataframe_for_location_features) - - + def test_missing_longitude_for_location_features( self, 
sample_dataframe_for_location_features ): @@ -123,10 +128,9 @@ def test_missing_longitude_for_location_features( ] # Test for missing 'longitude' with pytest.raises(ValueError, match="longitude column is missing"): FUtils.get_location_features(sample_dataframe_for_location_features) - - + # Test the normal procedure def test_get_location_features(self, sample_dataframe_for_location_features): df = FUtils.get_location_features(sample_dataframe_for_location_features) for cord in ["x_cord", "y_cord", "z_cord"]: - assert cord in df.columns \ No newline at end of file + assert cord in df.columns diff --git a/src/airflow/dags/data_warehouse.py b/src/airflow/dags/data_warehouse.py index af9c20a70b..09316c73e1 100644 --- a/src/airflow/dags/data_warehouse.py +++ b/src/airflow/dags/data_warehouse.py @@ -142,7 +142,6 @@ def load(data: pd.DataFrame): load(clean_consolidated_data) - @dag( "Historical-Consolidated-Data-ETL", schedule=None, @@ -185,7 +184,7 @@ def extract_hourly_weather_data(**kwargs): from airqo_etl_utils.date import DateUtils start_date_time, end_date_time = DateUtils.get_dag_date_time_values( - historical=True, **kwargs + historical=True, **kwargs ) return DataWarehouseUtils.extract_hourly_weather_data( @@ -238,6 +237,7 @@ def load(data: pd.DataFrame): ) load(merged_data) + @dag( "Historical-Cleanup-Consolidated-Data", schedule=None, @@ -280,6 +280,7 @@ def load(data: pd.DataFrame): clean_consolidated_data = remove_duplicates(consolidated_data) load(clean_consolidated_data) + data_warehouse_consolidated_data() data_warehouse_cleanup_consolidated_data() data_warehouse_historical_consolidated_data() diff --git a/src/airflow/dags/ml_prediction_jobs.py b/src/airflow/dags/ml_prediction_jobs.py index 20cd5593ca..9dd0be4ec1 100644 --- a/src/airflow/dags/ml_prediction_jobs.py +++ b/src/airflow/dags/ml_prediction_jobs.py @@ -36,13 +36,11 @@ def preprocess_historical_data_hourly_forecast(data): @task def generate_lag_and_rolling_features_hourly_forecast(data): return ForecastUtils.get_lag_and_roll_features(data, "pm2_5", "hourly") - - + @task() def get_time_and_cyclic_features_hourly_forecast(data): return ForecastUtils.get_time_and_cyclic_features(data, "hourly") - - + @task() def get_location_features_hourly_forecast(data): return ForecastUtils.get_location_features(data) @@ -107,7 +105,6 @@ def save_daily_forecasts_to_bigquery(data): def save_daily_forecasts_to_mongo(data): ForecastUtils.save_forecasts_to_mongo(data, "daily") - # Hourly forecast pipeline hourly_data = get_historical_data_for_hourly_forecasts() hourly_preprocessed_data = preprocess_historical_data_hourly_forecast(hourly_data) diff --git a/src/airflow/dags/ml_training_jobs.py b/src/airflow/dags/ml_training_jobs.py index e5822d12cc..40c563e86e 100644 --- a/src/airflow/dags/ml_training_jobs.py +++ b/src/airflow/dags/ml_training_jobs.py @@ -34,11 +34,11 @@ def preprocess_training_data_for_hourly_forecast_model(data): @task() def get_hourly_lag_and_rolling_features(data): - return ForecastUtils.get_lag_and_roll_features(data, 'pm2_5', 'hourly') + return ForecastUtils.get_lag_and_roll_features(data, "pm2_5", "hourly") @task() def get_hourly_time_and_cyclic_features(data): - return ForecastUtils.get_time_and_cyclic_features(data, 'hourly') + return ForecastUtils.get_time_and_cyclic_features(data, "hourly") @task() def get_location_features(data): @@ -46,7 +46,8 @@ def get_location_features(data): @task() def encode_categorical_features(data): - return DecodingUtils.encode_categorical_training_features(data, 'daily') + return 
DecodingUtils.encode_categorical_training_features(data, "daily") + @task() def train_and_save_hourly_forecast_model(train_data): return ForecastUtils.train_and_save_forecast_models( @@ -76,7 +77,7 @@ def get_daily_lag_and_rolling_features(data): @task() def get_daily_time_and_cylic_features(data): - return ForecastUtils.get_time_and_cyclic_features(data, 'daily') + return ForecastUtils.get_time_and_cyclic_features(data, "daily") @task() def get_location_features(data): @@ -84,7 +85,8 @@ def get_location_features(data): @task() def encode_categorical_features(data): - return DecodingUtils.encode_categorical_training_features(data, 'daily') + return DecodingUtils.encode_categorical_training_features(data, "daily") + @task() def train_and_save_daily_model(train_data): return ForecastUtils.train_and_save_forecast_models(train_data, "daily") From 75be7a5a7385ea5e5dd5f080085209905e3d57a6 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Mon, 18 Sep 2023 13:05:11 +0300 Subject: [PATCH 32/43] Update ml_prediction_jobs.py --- src/airflow/dags/ml_prediction_jobs.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/airflow/dags/ml_prediction_jobs.py b/src/airflow/dags/ml_prediction_jobs.py index 9dd0be4ec1..e602a0e3c1 100644 --- a/src/airflow/dags/ml_prediction_jobs.py +++ b/src/airflow/dags/ml_prediction_jobs.py @@ -3,7 +3,7 @@ from airqo_etl_utils.airflow_custom_utils import AirflowUtils from airqo_etl_utils.bigquery_api import BigQueryApi from airqo_etl_utils.config import configuration -from airqo_etl_utils.ml_utils import ForecastUtils +from airqo_etl_utils.ml_utils import ForecastUtils, DecodingUtils @dag( @@ -45,6 +45,10 @@ def get_time_and_cyclic_features_hourly_forecast(data): def get_location_features_hourly_forecast(data): return ForecastUtils.get_location_features(data) + @task() + def encode_hourly_categorical_features(data): + return DecodingUtils.decode_categorical_features_pred(data, "hourly") + @task() def make_hourly_forecasts(data): return ForecastUtils.generate_forecasts( @@ -90,6 +94,9 @@ def get_location_features_daily_forecast(data): return ForecastUtils.get_location_features(data) @task() + def encode_daily_categorical_features(data): + return DecodingUtils.decode_categorical_features_pred(data, "daily") + @task() def make_daily_forecasts(data): return ForecastUtils.generate_forecasts( data=data, project_name=project_id, bucket_name=bucket, frequency="daily" @@ -117,7 +124,10 @@ def save_daily_forecasts_to_mongo(data): hourly_location_features = get_location_features_hourly_forecast( hourly_time_and_cyclic_features ) - hourly_forecasts = make_hourly_forecasts(hourly_location_features) + hourly_encoded_features = encode_hourly_categorical_features( + hourly_location_features + ) + hourly_forecasts = make_hourly_forecasts(hourly_encoded_features) save_hourly_forecasts_to_bigquery(hourly_forecasts) save_hourly_forecasts_to_mongo(hourly_forecasts) @@ -133,7 +143,10 @@ def save_daily_forecasts_to_mongo(data): daily_location_features = get_location_features_daily_forecast( daily_time_and_cyclic_features ) - daily_forecasts = make_daily_forecasts(daily_location_features) + daily_encoded_features = encode_daily_categorical_features( + daily_location_features + ) + daily_forecasts = make_daily_forecasts(daily_encoded_features) save_daily_forecasts_to_bigquery(daily_forecasts) save_daily_forecasts_to_mongo(daily_forecasts) From ccb9ee22e060b1eadae3003b3bb9bec114ebc701 Mon Sep 17 00:00:00 2001 From: 
"github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 15:14:07 +0300 Subject: [PATCH 33/43] Update airflow staging image tag to stage-defae719-1695039035 --- k8s/airflow/values-stage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/airflow/values-stage.yaml b/k8s/airflow/values-stage.yaml index 20127f0f6d..0f8ec9a229 100644 --- a/k8s/airflow/values-stage.yaml +++ b/k8s/airflow/values-stage.yaml @@ -9,7 +9,7 @@ images: repositories: initContainer: eu.gcr.io/airqo-250220/airqo-stage-apache-airflow-xcom containers: eu.gcr.io/airqo-250220/airqo-stage-apache-airflow - tag: stage-d1aaf3c2-1694766672 + tag: stage-defae719-1695039035 nameOverride: '' fullnameOverride: '' podAnnotations: {} From 761aad8cd9f36675f52af9a8517ba5ae1efce1a8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 15:15:07 +0300 Subject: [PATCH 34/43] Update predict staging image tag to stage-defae719-1695039035 --- k8s/predict/values-stage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/predict/values-stage.yaml b/k8s/predict/values-stage.yaml index 79f9f7908d..0c1220227a 100644 --- a/k8s/predict/values-stage.yaml +++ b/k8s/predict/values-stage.yaml @@ -7,7 +7,7 @@ images: predictJob: eu.gcr.io/airqo-250220/stage-airqo-predict-job trainJob: eu.gcr.io/airqo-250220/stage-airqo-train-job predictPlaces: eu.gcr.io/airqo-250220/stage-airqo-predict-places-air-quality - tag: stage-84518356-1693167908 + tag: stage-defae719-1695039035 api: name: airqo-stage-prediction-api label: prediction-api From e010fe97e50771d0de11b1abf699a39c4e43625b Mon Sep 17 00:00:00 2001 From: Benjamin Ssempala <86492979+BenjaminSsempala@users.noreply.github.com> Date: Tue, 19 Sep 2023 14:28:28 +0300 Subject: [PATCH 35/43] Add Translation for Lessons and Quizzes --- src/device-registry/routes/v2/kya.js | 40 +++ src/device-registry/utils/create-event.js | 2 +- .../utils/create-health-tips.js | 2 +- .../utils/create-know-your-air.js | 15 ++ .../utils/test/ut_create-know-your-air.js | 43 ++++ .../utils/test/ut_translate.js | 229 +++++++++++++++--- src/device-registry/utils/translate.js | 97 +++++++- 7 files changed, 386 insertions(+), 42 deletions(-) diff --git a/src/device-registry/routes/v2/kya.js b/src/device-registry/routes/v2/kya.js index b1158a28eb..5cfff363cc 100644 --- a/src/device-registry/routes/v2/kya.js +++ b/src/device-registry/routes/v2/kya.js @@ -79,6 +79,16 @@ router.get( }), ], ]), + oneOf([ + [ + query("language") + .optional() + .notEmpty() + .withMessage("the language cannot be empty when provided") + .bail() + .trim() + ], + ]), knowYourAirController.listLessons ); @@ -97,6 +107,16 @@ router.get( .withMessage("the tenant value is not among the expected ones"), ], ]), + oneOf([ + [ + query("language") + .optional() + .notEmpty() + .withMessage("the language cannot be empty when provided") + .bail() + .trim() + ], + ]), oneOf([ [ @@ -1183,6 +1203,16 @@ router.get( }), ], ]), + oneOf([ + [ + query("language") + .optional() + .notEmpty() + .withMessage("the language cannot be empty when provided") + .bail() + .trim() + ], + ]), knowYourAirController.listQuizzes ); @@ -1201,6 +1231,16 @@ router.get( .withMessage("the tenant value is not among the expected ones"), ], ]), + oneOf([ + [ + query("language") + .optional() + .notEmpty() + .withMessage("the language cannot be empty when provided") + .bail() + .trim() + ], + ]), oneOf([ [ diff --git 
a/src/device-registry/utils/create-event.js b/src/device-registry/utils/create-event.js index 48d8144797..26b76022e1 100644 --- a/src/device-registry/utils/create-event.js +++ b/src/device-registry/utils/create-event.js @@ -595,7 +595,7 @@ const createEvent = { if (language !== undefined && constants.ENVIRONMENT === "STAGING ENVIRONMENT") { let data = responseFromListEvents.data[0].data; for (const event of data) { - let translatedHealthTips = await translateUtil.translate(event.health_tips, language); + let translatedHealthTips = await translateUtil.translateTips(event.health_tips, language); if (translatedHealthTips.success === true) { event.health_tips = translatedHealthTips.data; } diff --git a/src/device-registry/utils/create-health-tips.js b/src/device-registry/utils/create-health-tips.js index 87a198b98b..fdac17a412 100644 --- a/src/device-registry/utils/create-health-tips.js +++ b/src/device-registry/utils/create-health-tips.js @@ -34,7 +34,7 @@ const createHealthTips = { skip, }); if (language !== undefined) { - translatedHealthTips = await translateUtil.translate(responseFromListHealthTips.data, language); + translatedHealthTips = await translateUtil.translateTips(responseFromListHealthTips.data, language); responseFromListHealthTips = translatedHealthTips; } diff --git a/src/device-registry/utils/create-know-your-air.js b/src/device-registry/utils/create-know-your-air.js index 10a02af088..5d8a821862 100644 --- a/src/device-registry/utils/create-know-your-air.js +++ b/src/device-registry/utils/create-know-your-air.js @@ -13,6 +13,7 @@ const { logObject, logElement, logText } = require("./log"); const generateFilter = require("./generate-filter"); const log4js = require("log4js"); const logger = log4js.getLogger(`${constants.ENVIRONMENT} -- create-kya-util`); +const translateUtil = require("./translate"); const mongoose = require("mongoose").set("debug", true); const ObjectId = mongoose.Types.ObjectId; @@ -44,6 +45,7 @@ const createKnowYourAir = { const { user_id } = request.params; const limit = parseInt(request.query.limit, 0); const skip = parseInt(request.query.skip, 0); + const language = request.query.language; const filter = generateFilter.kyalessons(request); if (filter.success && filter.success === false) { return filter; @@ -57,6 +59,12 @@ const createKnowYourAir = { user_id: user_id, } ); + if (language !== undefined) { + const translatedLessons = await translateUtil.translateLessons(responseFromListLessons.data, language); + if (translatedLessons.success === true) { + return translatedLessons; + } + } logObject("responseFromListLessons", responseFromListLessons); return responseFromListLessons; } catch (error) { @@ -938,6 +946,7 @@ const createKnowYourAir = { const { user_id } = request.params; const limit = parseInt(request.query.limit, 0); const skip = parseInt(request.query.skip, 0); + const language = request.query.language; const filter = generateFilter.kyaquizzes(request); if (filter.success && filter.success === false) { return filter; @@ -949,6 +958,12 @@ const createKnowYourAir = { skip, user_id: user_id, }); + if (language !== undefined) { + const translatedQuizzes = await translateUtil.translateQuizzes(responseFromListQuizzes.data, language); + if (translatedQuizzes.success === true) { + return translatedQuizzes; + } + } logObject("responseFromListQuizzes", responseFromListQuizzes); return responseFromListQuizzes; } catch (error) { diff --git a/src/device-registry/utils/test/ut_create-know-your-air.js 
b/src/device-registry/utils/test/ut_create-know-your-air.js index fcb5499776..2c934b1d44 100644 --- a/src/device-registry/utils/test/ut_create-know-your-air.js +++ b/src/device-registry/utils/test/ut_create-know-your-air.js @@ -42,6 +42,28 @@ describe("createKnowYourAir Utility Functions", () => { listStub.restore(); }); + it("should return a list of translated lessons successfully", async () => { + const request = { + query: { tenant: "your-tenant" }, + params: { user_id: "user-id" }, + query: { limit: 10, skip: 0, language: "fr" }, + }; + + // Stub KnowYourAirLessonModel.list + const listStub = sinon + .stub(KnowYourAirLessonModel("your-tenant"), "list") + .resolves({ success: true, data: [], status: httpStatus.OK }); + + const result = await createKnowYourAir.listLesson(request); + + expect(result.success).to.be.true; + expect(result.data).to.deep.equal([]); + expect(result.status).to.equal(httpStatus.OK); + + // Restore the stub + listStub.restore(); + }); + it("should handle filter failure", async () => { const request = { query: { tenant: "your-tenant" }, @@ -1639,6 +1661,27 @@ describe("createKnowYourAir Utility Functions", () => { KnowYourAirQuizModel("your-tenant").list.restore(); }); + it("should list translated quizzes", async () => { + const request = { + query: { tenant: "your-tenant" }, + params: { user_id: "user-id" }, + query: { limit: 10, skip: 0, language: "fr" }, + }; + + // Stub KnowYourAirQuizModel(tenant).list to return quiz data + const quizListStub = sinon + .stub(KnowYourAirQuizModel("your-tenant"), "list") + .resolves({ success: true /* other response properties */ }); + + const result = await createKnowYourAir.listQuiz(request); + + expect(result.success).to.be.true; + // Your other assertions here + + // Restore the stub + KnowYourAirQuizModel("your-tenant").list.restore(); + }); + it("should handle filter failure", async () => { const request = { query: { tenant: "your-tenant" }, diff --git a/src/device-registry/utils/test/ut_translate.js b/src/device-registry/utils/test/ut_translate.js index 665dff3b4f..3c990a4125 100644 --- a/src/device-registry/utils/test/ut_translate.js +++ b/src/device-registry/utils/test/ut_translate.js @@ -9,45 +9,195 @@ const httpStatus = require("http-status"); const translateUtil = require("@utils/translate"); describe('translateUtil', () => { - it('should translate health tips to the target language', async () => { - const healthTips = [ - { - title: 'Hello', - description: 'World', - }, - { - title: 'Good', - description: 'Morning', - }, - ]; - const targetLanguage = 'fr'; - - const expectedTranslations = [ - { - title: 'Bonjour', - description: 'Monde', - }, - { - title: 'Bien', - description: 'Matin', - }, - ]; - - const result = await translateUtil.translate(healthTips, targetLanguage); - - - expect(result).to.have.property('success', true); - for (let i = 0; i < result.data.length; i++) { - expect(result.data[i].title).to.equal(expectedTranslations[i].title); - expect(result.data[i].description).to.equal(expectedTranslations[i].description); - } - }).timeout(10000); - - it('should handle translation errors gracefully', async () => { - - const healthTips = null; - const targetLanguage = 'fr'; - const result = await translateUtil.translate(healthTips, targetLanguage); + describe("translateTips", () => { + it('should translate health tips to the target language', async () => { + const healthTips = [ + { + title: 'Hello', + description: 'World', + }, + { + title: 'Good', + description: 'Morning', + }, + ]; + const targetLanguage = 
'fr'; + + const expectedTranslations = [ + { + title: 'Bonjour', + description: 'Monde', + }, + { + title: 'Bien', + description: 'Matin', + }, + ]; + + const result = await translateUtil.translateTips(healthTips, targetLanguage); + + + expect(result).to.have.property('success', true); + for (let i = 0; i < result.data.length; i++) { + expect(result.data[i].title).to.equal(expectedTranslations[i].title); + expect(result.data[i].description).to.equal(expectedTranslations[i].description); + } + }).timeout(10000); + + it('should handle translation errors gracefully', async () => { + + const healthTips = null; + const targetLanguage = 'fr'; + const result = await translateUtil.translateTips(healthTips, targetLanguage); + + expect(result).to.have.property('success', false); + expect(result).to.have.property('message', 'Internal Server Error'); + expect(result).to.have.property('status', 500); + expect(result).to.have.property('errors'); + expect(result.errors).to.have.property('message'); + }); + }) + + describe("translateLessons", () => { + it('should translate Kya lessons to the target language', async () => { + const kyaLessons = [ + { + "_id": "testId", + "title": "Actions you can take to reduce air pollution", + "completion_message": "You just finished your first Know Your Air Lesson", + "image": "https://testimage", + "tasks": [ + { + "_id": "testId", + "title": "Use public transport", + "content": "Vehicle exhaust is a major source of air pollution. Less cars on the road results in less emissions.", + "image": "https://testimage", + "task_position": 2 + }, + ] + } + ]; + const targetLanguage = 'fr'; + + const expectedTranslations = [ + { + "_id": "testId", + "title": "Mesures que vous pouvez prendre pour réduire la pollution de l’air", + "completion_message": "Vous venez de terminer votre première leçon Know Your Air.", + "image": "https://testimage", + "tasks": [ + { + "_id": "testId", + "title": "Utilisez les transports en commun", + "content": "Les gaz d’échappement des véhicules constituent une source majeure de pollution atmosphérique. Moins de voitures sur la route entraîne moins d’émissions.", + "image": "https://testimage", + "task_position": 2 + }, + ] + } + ]; + + const result = await translateUtil.translateLessons(kyaLessons, targetLanguage); + + + expect(result).to.have.property('success', true); + for (let i = 0; i < result.data.length; i++) { + expect(result.data[i].title).to.equal(expectedTranslations[i].title); + expect(result.data[i].completion_message).to.equal(expectedTranslations[i].completion_message); + expect(result.data[i].tasks).to.deep.equal(expectedTranslations[i].tasks); + } + }).timeout(10000); + + it('should handle translation errors gracefully', async () => { + + const lessons = null; + const targetLanguage = 'fr'; + const result = await translateUtil.translateLessons(lessons, targetLanguage); + + expect(result).to.have.property('success', false); + expect(result).to.have.property('message', 'Internal Server Error'); + expect(result).to.have.property('status', 500); + expect(result).to.have.property('errors'); + expect(result.errors).to.have.property('message'); + }); + }); + describe("translateQuizzes", () => { + it('should translate Kya Quizzes to the target language', async () => { + const kyaQuizzes = [ + { + "_id": "testId", + "title": "Get personalised air quality recommendations", + "description": "Tell us more about Air Quality conditions in your environment & get personalised tips.", + "completion_message": "Way to go🎊. 
You have unlocked personalised air quality recommendations to empower you on your clean air journey.", + "image": "https//testImage", + "questions": [ + { + "title": "Where is your home environment situated?", + "context": "Home environment", + "question_position": 1, + "answers": [ + { + "content": [ + "Cooking with firewood can emit significant amounts of air pollutants.", + "Cook in a well-ventilated kitchen with good airflow or set up an outdoor kitchen if possible.", + "Use an efficient stove designed to burn firewood more cleanly and with less smoke.", + "Consider switching to improved cookstoves that reduce emissions and increase fuel efficiency." + ], + "title": "Firewood", + } + ] + }, + ], + }, + ]; + + const targetLanguage = 'fr'; + + const expectedTranslations = [ + { + "_id": "testId", + "title": "Obtenez des recommandations personnalisées sur la qualité de l'air", + "description": "Dites-nous en plus sur les conditions de qualité de l'air dans votre environnement et obtenez des conseils personnalisés.", + "completion_message": "Bravo🎊. Vous avez débloqué des recommandations personnalisées sur la qualité de l'air pour vous aider dans votre voyage vers un air pur.", + "image": "https//testImage", + "questions": [ + { + "title": "Où se situe votre environnement domestique ?", + "context": "Environnement de la maison", + "question_position": 1, + "answers": [ + { + "content": [ + "Cuisiner avec du bois de chauffage peut émettre des quantités importantes de polluants atmosphériques.", + "Cuisinez dans une cuisine bien ventilée avec une bonne circulation d’air ou installez une cuisine extérieure si possible.", + "Utilisez un poêle efficace conçu pour brûler du bois de chauffage plus proprement et avec moins de fumée.", + "Envisagez de passer à des cuisinières améliorées qui réduisent les émissions et augmentent le rendement énergétique." 
+ ], + "title": "Bois de chauffage", + } + ] + }, + ], + }, + ]; + + const result = await translateUtil.translateQuizzes(kyaQuizzes, targetLanguage); + + + expect(result).to.have.property('success', true); + for (let i = 0; i < result.data.length; i++) { + expect(result.data[i].title).to.equal(expectedTranslations[i].title); + expect(result.data[i].completion_message).to.equal(expectedTranslations[i].completion_message); + expect(result.data[i].questions).to.deep.equal(expectedTranslations[i].questions); + expect(result.data[i].questions.answers).to.deep.equal(expectedTranslations[i].questions.answers); + } + }).timeout(10000); + + it('should handle translation errors gracefully', async () => { + + const kyaQuizzes = null; + const targetLanguage = 'fr'; + const result = await translateUtil.translateQuizzes(kyaQuizzes, targetLanguage); expect(result).to.have.property('success', false); expect(result).to.have.property('message', 'Internal Server Error'); @@ -55,4 +205,5 @@ describe('translateUtil', () => { expect(result).to.have.property('errors'); expect(result.errors).to.have.property('message'); }); +}); }); \ No newline at end of file diff --git a/src/device-registry/utils/translate.js b/src/device-registry/utils/translate.js index 9f885e78c5..eba61b275e 100644 --- a/src/device-registry/utils/translate.js +++ b/src/device-registry/utils/translate.js @@ -9,7 +9,7 @@ const { Translate } = require('@google-cloud/translate').v2; const translate = new Translate(); const translateUtil = { - translate: async (healthTips, targetLanguage) => { + translateTips: async (healthTips, targetLanguage) => { try { const translatedHealthTips = []; @@ -39,6 +39,101 @@ const translateUtil = { }; } }, + + translateLessons: async (lessons, targetLanguage) => { + try { + const translatedLessons = []; + + for (const lesson of lessons) { + const translatedLesson = { ...lesson }; + translatedLesson.title = await translateText(lesson.title, targetLanguage); + translatedLesson.completion_message = await translateText(lesson.completion_message, targetLanguage); + const translatedTasks = []; + for (const task of lesson.tasks) { + const translatedTask = { ...task }; + translatedTask.title = await translateText(task.title, targetLanguage); + translatedTask.content = await translateText(task.content, targetLanguage); + translatedTasks.push(translatedTask); + } + translatedLesson.tasks = translatedTasks + translatedLessons.push(translatedLesson); + } + + return { + success: true, + message: "Translated KYA returned Successfully", + data: translatedLessons, + status: httpStatus.OK, + }; + } catch (error) { + logger.error(`internal server error -- ${error.message}`); + console.log(`internal server error -- ${error.message}`); + + return { + success: false, + message: "Internal Server Error", + status: httpStatus.INTERNAL_SERVER_ERROR, + errors: { + message: error.message, + }, + }; + } + }, + + translateQuizzes: async (quizzes, targetLanguage) => { + try { + const translatedQuizzes = []; + + for (const quiz of quizzes) { + const translatedQuiz = { ...quiz }; + translatedQuiz.title = await translateText(quiz.title, targetLanguage); + translatedQuiz.description = await translateText(quiz.description, targetLanguage); + translatedQuiz.completion_message = await translateText(quiz.completion_message, targetLanguage); + const translatedQuestions = []; + for (const question of quiz.questions) { + const translatedQuestion = { ...question }; + translatedQuestion.title = await translateText(question.title, targetLanguage); + 
translatedQuestion.context = await translateText(question.context, targetLanguage); + const translatedAnswers = []; + for (const answer of question.answers) { + const translatedAnswer = { ...answer }; + translatedAnswer.title = await translateText(answer.title, targetLanguage); + const translatedContent = []; + for (const contentItem of answer.content) { + const translatedItem = await translateText(contentItem, targetLanguage); + translatedContent.push(translatedItem); + } + translatedAnswer.content = translatedContent; + + translatedAnswers.push(translatedAnswer); + } + translatedQuestion.answers = translatedAnswers; + translatedQuestions.push(translatedQuestion); + } + translatedQuiz.questions = translatedQuestions + translatedQuizzes.push(translatedQuiz); + } + + return { + success: true, + message: "Translated KYA returned Successfully", + data: translatedQuizzes, + status: httpStatus.OK, + }; + } catch (error) { + logger.error(`internal server error -- ${error.message}`); + return { + success: false, + message: "Internal Server Error", + status: httpStatus.INTERNAL_SERVER_ERROR, + errors: { + message: error.message, + }, + }; + } + }, + + }; async function translateText(text, target) { From 69e96c86e7dd6f200044304054674d42d2090ce7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 19 Sep 2023 16:20:45 +0300 Subject: [PATCH 36/43] Update device registry staging image tag to stage-33cbc445-1695129549 --- k8s/device-registry/values-stage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/device-registry/values-stage.yaml b/k8s/device-registry/values-stage.yaml index a6e50f65cd..3af90282ae 100644 --- a/k8s/device-registry/values-stage.yaml +++ b/k8s/device-registry/values-stage.yaml @@ -6,7 +6,7 @@ app: replicaCount: 2 image: repository: eu.gcr.io/airqo-250220/airqo-stage-device-registry-api - tag: stage-5e65174e-1695028544 + tag: stage-33cbc445-1695129549 nameOverride: '' fullnameOverride: '' podAnnotations: {} From fba683db9e069b9d513e6fb743d3d0bd99366e53 Mon Sep 17 00:00:00 2001 From: Mutabazi Noble <60974514+Mnoble-19@users.noreply.github.com> Date: Wed, 20 Sep 2023 03:43:51 +0300 Subject: [PATCH 37/43] Fix save_to_mongo method --- src/airflow/airqo_etl_utils/config.py | 4 ++ src/airflow/airqo_etl_utils/ml_utils.py | 49 ++++++++++++------- src/airflow/airqo_etl_utils/tests/conftest.py | 43 ++++++++++++++-- .../airqo_etl_utils/tests/ml_utils_tests.py | 23 +++++++++ src/airflow/dags/ml_prediction_jobs.py | 5 +- 5 files changed, 98 insertions(+), 26 deletions(-) diff --git a/src/airflow/airqo_etl_utils/config.py b/src/airflow/airqo_etl_utils/config.py index 9176da5f2e..3f2768bf8a 100644 --- a/src/airflow/airqo_etl_utils/config.py +++ b/src/airflow/airqo_etl_utils/config.py @@ -1,6 +1,7 @@ import os from pathlib import Path +import pymongo as pm import urllib3 from dotenv import load_dotenv @@ -174,3 +175,6 @@ class Config: configuration = Config() + +client = pm.MongoClient(configuration.MONGO_URI) +db = client[configuration.MONGO_DATABASE_NAME] diff --git a/src/airflow/airqo_etl_utils/ml_utils.py b/src/airflow/airqo_etl_utils/ml_utils.py index 811e267515..53bf7af901 100644 --- a/src/airflow/airqo_etl_utils/ml_utils.py +++ b/src/airflow/airqo_etl_utils/ml_utils.py @@ -8,11 +8,10 @@ import numpy as np import optuna import pandas as pd -import pymongo as pm from lightgbm import LGBMRegressor, early_stopping from sklearn.metrics import mean_squared_error -from .config import configuration +from 
.config import configuration, db project_id = configuration.GOOGLE_CLOUD_PROJECT_ID bucket = configuration.FORECAST_MODELS_BUCKET @@ -493,6 +492,7 @@ def objective(trial): @staticmethod def generate_forecasts(data, project_name, bucket_name, frequency): + data = data.dropna(subset=["device_id"]) data["timestamp"] = pd.to_datetime(data["timestamp"]) data.columns = data.columns.str.strip() # data["margin_of_error"] = data["adjusted_forecast"] = 0 @@ -641,25 +641,36 @@ def get_forecasts( def save_forecasts_to_mongo(data, frequency): device_ids = data["device_id"].unique() created_at = pd.to_datetime(datetime.now()).isoformat() - forecast_results = [ - { - field: data[data["device_id"] == i][field].tolist()[0] - if field not in ["pm2_5", "timestamp"] - else data[data["device_id"] == i][field].tolist() - for field in data.columns + + forecast_results = [] + for i in device_ids: + doc = { + "device_id": i, + "created_at": created_at, + "pm2_5": data[data["device_id"] == i]["pm2_5"].tolist(), + "timestamp": data[data["device_id"] == i]["timestamp"].tolist(), } - | {"created_at": created_at} - for i in device_ids - ] - client = pm.MongoClient(configuration.MONGO_URI) - db = client[configuration.MONGO_DATABASE_NAME] + forecast_results.append(doc) + if frequency == "hourly": - db.hourly_forecasts.delete_many({}) - db.hourly_forecasts.insert_many(forecast_results) - print(db.hourly_forecasts.find_one()) # confirm saving has worked + collection = db.hourly_forecasts elif frequency == "daily": - db.daily_forecasts.delete_many({}) - db.daily_forecasts.insert_many(forecast_results) - print(db.daily_forecasts_1.find_one()) + collection = db.daily_forecasts else: raise ValueError("Invalid frequency argument") + + for doc in forecast_results: + try: + filter_query = {"device_id": doc["device_id"]} + update_query = { + "$set": { + "pm2_5": doc["pm2_5"], + "timestamp": doc["timestamp"], + "created_at": doc["created_at"], + } + } + collection.update_one(filter_query, update_query, upsert=True) + except Exception as e: + print( + f"Failed to update forecast for device {doc['device_id']}: {str(e)}" + ) diff --git a/src/airflow/airqo_etl_utils/tests/conftest.py b/src/airflow/airqo_etl_utils/tests/conftest.py index cdbc784dc7..cfb5b15bb5 100644 --- a/src/airflow/airqo_etl_utils/tests/conftest.py +++ b/src/airflow/airqo_etl_utils/tests/conftest.py @@ -1,9 +1,12 @@ from datetime import datetime +from unittest.mock import MagicMock import numpy as np import pandas as pd import pytest +from airqo_etl_utils.config import configuration + def pytest_configure(config): config.addinivalue_line( @@ -59,12 +62,44 @@ def sample_dataframe_for_location_features(): } return pd.DataFrame(data) + @staticmethod + @pytest.fixture + def sample_hourly_forecast_data(): + return pd.DataFrame( + { + "device_id": ["dev1", "dev1", "dev2"], + "pm2_5": [10, 15, 20], + "timestamp": [ + datetime(2023, 1, 1, 0), + datetime(2023, 1, 1, 1), + datetime(2023, 1, 1, 2), + ], + } + ) -@pytest.fixture(scope="session") -def mongo_fixture(): - from airqo_etl_utils.mongo_client import MongoClient + @staticmethod + @pytest.fixture + def sample_daily_forecast_data(): + return pd.DataFrame( + { + "device_id": ["dev1", "dev1", "dev2"], + "pm2_5": [10, 15, 20], + "timestamp": [ + datetime(2023, 1, 1), + datetime(2023, 1, 2), + datetime(2023, 1, 3), + ], + } + ) - return MongoClient(uri="mongodb://localhost:27017", db_name="test_db") + @staticmethod + @pytest.fixture + def mock_db(): + mock_client = MagicMock() + mock_db = 
mock_client[configuration.MONGO_DATABASE_NAME] + mock_db.hourly_forecasts = MagicMock() + mock_db.daily_forecasts = MagicMock() + return mock_db class FaultDetectionFixtures: diff --git a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py index f07f58c908..d03b02d7bc 100644 --- a/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py +++ b/src/airflow/airqo_etl_utils/tests/ml_utils_tests.py @@ -134,3 +134,26 @@ def test_get_location_features(self, sample_dataframe_for_location_features): df = FUtils.get_location_features(sample_dataframe_for_location_features) for cord in ["x_cord", "y_cord", "z_cord"]: assert cord in df.columns + + @pytest.mark.xfail + @pytest.mark.parametrize( + "frequency,collection_name", + [ + ("hourly", "hourly_forecasts"), + ("daily", "daily_forecasts"), + # ("invalid", None), + ], + ) + def test_save_forecasts_to_mongo_frequency( + self, mock_db, frequency, collection_name, sample_dataframe_db + ): + if frequency == "invalid": + # Expect a ValueError for an invalid frequency + with pytest.raises(ValueError) as e: + FUtils.save_forecasts_to_mongo(sample_dataframe_db, frequency) + assert str(e.value) == f"Invalid frequency argument: {frequency}" + else: + # Expect no exception for a valid frequency + FUtils.save_forecasts_to_mongo(sample_dataframe_db, frequency) + mock_collection = getattr(mock_db, collection_name) + assert mock_collection.update_one.call_count == 0 diff --git a/src/airflow/dags/ml_prediction_jobs.py b/src/airflow/dags/ml_prediction_jobs.py index e602a0e3c1..f68a8d4c8d 100644 --- a/src/airflow/dags/ml_prediction_jobs.py +++ b/src/airflow/dags/ml_prediction_jobs.py @@ -96,6 +96,7 @@ def get_location_features_daily_forecast(data): @task() def encode_daily_categorical_features(data): return DecodingUtils.decode_categorical_features_pred(data, "daily") + @task() def make_daily_forecasts(data): return ForecastUtils.generate_forecasts( @@ -143,9 +144,7 @@ def save_daily_forecasts_to_mongo(data): daily_location_features = get_location_features_daily_forecast( daily_time_and_cyclic_features ) - daily_encoded_features = encode_daily_categorical_features( - daily_location_features - ) + daily_encoded_features = encode_daily_categorical_features(daily_location_features) daily_forecasts = make_daily_forecasts(daily_encoded_features) save_daily_forecasts_to_bigquery(daily_forecasts) save_daily_forecasts_to_mongo(daily_forecasts) From 4205c1742cfdfc1f6957a87485901a3b273ac675 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 20 Sep 2023 09:07:15 +0300 Subject: [PATCH 38/43] Update airflow staging image tag to stage-586026aa-1695189812 --- k8s/airflow/values-stage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/airflow/values-stage.yaml b/k8s/airflow/values-stage.yaml index 0f8ec9a229..2cfb0ff91e 100644 --- a/k8s/airflow/values-stage.yaml +++ b/k8s/airflow/values-stage.yaml @@ -9,7 +9,7 @@ images: repositories: initContainer: eu.gcr.io/airqo-250220/airqo-stage-apache-airflow-xcom containers: eu.gcr.io/airqo-250220/airqo-stage-apache-airflow - tag: stage-defae719-1695039035 + tag: stage-586026aa-1695189812 nameOverride: '' fullnameOverride: '' podAnnotations: {} From 027c8570c3d973df2de80919752a0631a0a15da5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 20 Sep 2023 12:08:03 +0300 Subject: [PATCH 39/43] Update airflow staging image tag to 
stage-586026aa-1695189812 From 69be96b461c9c340c9a7468f65727f19a0e885b2 Mon Sep 17 00:00:00 2001 From: Benjamin Ssempala <86492979+BenjaminSsempala@users.noreply.github.com> Date: Thu, 21 Sep 2023 04:35:03 +0300 Subject: [PATCH 40/43] Add tests for data-mgt --- src/data-mgt/node/utils/log.js | 2 +- src/data-mgt/node/utils/test/ut_date.js | 97 ++++++++++++++++++++++- src/data-mgt/node/utils/test/ut_errors.js | 84 +++++++++++++++++++- src/data-mgt/node/utils/test/ut_log.js | 68 +++++++++++++++- 4 files changed, 246 insertions(+), 5 deletions(-) diff --git a/src/data-mgt/node/utils/log.js b/src/data-mgt/node/utils/log.js index 8814967bcd..9a78b39f6e 100644 --- a/src/data-mgt/node/utils/log.js +++ b/src/data-mgt/node/utils/log.js @@ -29,7 +29,7 @@ const logError = (error) => { // console.error(e); if (process.env.NODE_ENV !== "production") { console.log("an unhandled promise rejection" + ": "); - console.error(e); + console.error(error); } return "log deactivated in prod and stage"; }; diff --git a/src/data-mgt/node/utils/test/ut_date.js b/src/data-mgt/node/utils/test/ut_date.js index e75c7e3996..42b457b961 100644 --- a/src/data-mgt/node/utils/test/ut_date.js +++ b/src/data-mgt/node/utils/test/ut_date.js @@ -1 +1,96 @@ -require("module-alias/register"); +const { expect } = require('chai'); +const DateUtil = require('../date'); + +describe('Date Util', () => { + describe('generateDateFormat', () => { + it('should return a formatted date string with hours', async () => { + const ISODate = '2023-09-21T12:34:56Z'; + const result = await DateUtil.generateDateFormat(ISODate); + expect(result).to.equal('2023-09-21-12'); + }); + }); + + describe('isTimeEmpty', () => { + it('should return false for a valid time', () => { + const dateTime = '2023-09-21T12:34:56Z'; + const result = DateUtil.isTimeEmpty(dateTime); + expect(result).to.be.false; + }); + + it('should return true for an empty time', () => { + const dateTime = '2023-09-21'; + const result = DateUtil.isTimeEmpty(dateTime); + expect(result).to.be.true; + }); + }); + + describe('generateDateFormatWithoutHrs', () => { + it('should return a formatted date string without hours', () => { + const ISODate = '2023-09-21T12:34:56Z'; + const result = DateUtil.generateDateFormatWithoutHrs(ISODate); + expect(result).to.equal('2023-09-21'); + }); + }); + + describe('isDate', () => { + it('should return true for date strings with "-" or "/"', () => { + expect(DateUtil.isDate('2023-09-21')).to.be.true; + expect(DateUtil.isDate('09/21/2023')).to.be.true; + }); + + it('should return false for non-date strings', () => { + expect(DateUtil.isDate('2023')).to.be.false; + expect(DateUtil.isDate('Hello, World!')).to.be.false; + }); + }); + + describe('addMonthsToProvideDateTime', () => { + it('should add months to a provided date/time', () => { + const dateTime = '2023-09-21T12:34:56Z'; + const number = 3; + const result = DateUtil.addMonthsToProvideDateTime(dateTime, number); + expect(result).to.be.a('Date'); + }); + + it('should handle empty time and add months to date', () => { + const date = '2023-09-21'; + const number = 3; + const result = DateUtil.addMonthsToProvideDateTime(date, number); + expect(result).to.be.a('Date'); + }); + }); + + describe('monthsInfront', () => { + it('should return a date in the future with the given number of months', () => { + const number = 3; + const result = DateUtil.monthsInfront(number); + expect(result).to.be.a('Date'); + }); + }); + + describe('addDays', () => { + it('should add days to the current date', () => { + const 
number = 7; + const result = DateUtil.addDays(number); + expect(result).to.be.a('Date'); + }); + }); + + describe('getDifferenceInMonths', () => { + it('should calculate the difference in months between two dates', () => { + const date1 = '2023-09-21'; + const date2 = '2024-01-15'; + const result = DateUtil.getDifferenceInMonths(date1, date2); + expect(result).to.equal(4); + }); + }); + + describe('threeMonthsFromNow', () => { + it('should return a date three months from the provided date', () => { + const date = '2023-09-21'; + const result = DateUtil.threeMonthsFromNow(date); + expect(result).to.be.a('Date'); + }); + }); + +}); diff --git a/src/data-mgt/node/utils/test/ut_errors.js b/src/data-mgt/node/utils/test/ut_errors.js index e75c7e3996..6c13a790d5 100644 --- a/src/data-mgt/node/utils/test/ut_errors.js +++ b/src/data-mgt/node/utils/test/ut_errors.js @@ -1 +1,83 @@ -require("module-alias/register"); +const chai = require("chai"); +const { expect } = chai; +const sinon = require("sinon"); +const HTTPStatus = require("http-status"); +const errors = require("../errors"); + +describe("Errors Utility Functions", () => { + describe("convertErrorArrayToObject", () => { + it("should convert an array of errors to an object", () => { + const errorArray = [ + { param: "field1", msg: "Field 1 is required" }, + { param: "field2", msg: "Field 2 must be a number" }, + ]; + + const result = errors.convertErrorArrayToObject(errorArray); + + expect(result).to.deep.equal({ + field1: "Field 1 is required", + field2: "Field 2 must be a number", + }); + }); + }); + + describe("errorResponse", () => { + it("should send an error response with default status code", () => { + const res = { + status: sinon.stub().returnsThis(), + json: sinon.spy(), + }; + + errors.errorResponse({ res, message: "An error occurred" }); + + expect(res.status.calledWith(HTTPStatus.INTERNAL_SERVER_ERROR)).to.be.true; + expect(res.json.calledWithMatch({ + success: false, + message: "An error occurred", + error: { + statusCode: HTTPStatus.INTERNAL_SERVER_ERROR, + message: "An error occurred", + error: {}, + }, + })).to.be.true; + }); + + it("should send an error response with a custom status code", () => { + const res = { + status: sinon.stub().returnsThis(), + json: sinon.spy(), + }; + + errors.errorResponse({ res, message: "Bad request", statusCode: HTTPStatus.BAD_REQUEST }); + + expect(res.status.calledWith(HTTPStatus.BAD_REQUEST)).to.be.true; + expect(res.json.calledWithMatch({ + success: false, + message: "Bad request", + error: { + statusCode: HTTPStatus.BAD_REQUEST, + message: "Bad request", + error: {}, + }, + })).to.be.true; + }); + }); + + describe("badRequest", () => { + it("should send a bad request response", () => { + const res = { + status: sinon.stub().returnsThis(), + json: sinon.spy(), + }; + + errors.badRequest(res, "Bad request", { field: "Invalid input" }); + + expect(res.status.calledWith(HTTPStatus.BAD_REQUEST)).to.be.true; + expect(res.json.calledWithMatch({ + success: false, + message: "Bad request", + errors: { field: "Invalid input" }, + })).to.be.true; + }); + }); +}); diff --git a/src/data-mgt/node/utils/test/ut_log.js b/src/data-mgt/node/utils/test/ut_log.js index 74b1a1f804..e21e7d11a8 100644 --- a/src/data-mgt/node/utils/test/ut_log.js +++ b/src/data-mgt/node/utils/test/ut_log.js @@ -1,2 +1,66 @@ -require("module-alias/register"); -s; +const chai = require("chai"); +const { expect } = chai; +const sinon = require("sinon"); +const { logText, logElement, logObject, logError } = require("../log"); + 
+describe("Logging Utility Functions", () => { + describe("logText", () => { + it("should log a message when not in production", () => { + const consoleLogStub = sinon.stub(console, "log"); + process.env.NODE_ENV = "development"; + const result = logText("Test log message"); + expect(consoleLogStub.calledOnce).to.be.true; + expect(consoleLogStub.calledWith("Test log message")).to.be.true; + consoleLogStub.restore(); + process.env.NODE_ENV = "test"; + }); + + it("should return a log deactivation message in production", () => { + const consoleLogStub = sinon.stub(console, "log"); + process.env.NODE_ENV = "production"; + const result = logText("Test log message"); + expect(consoleLogStub.notCalled).to.be.true; + expect(result).to.equal("log deactivated in prod and stage"); + consoleLogStub.restore(); + process.env.NODE_ENV = "test"; + }); + }); + + describe("logElement", () => { + it("should log an element when not in production", () => { + const consoleLogStub = sinon.stub(console, "log"); + process.env.NODE_ENV = "development"; + const result = logElement("Test", "Element"); + expect(consoleLogStub.calledOnce).to.be.true; + expect(consoleLogStub.calledWith("Test: Element")).to.be.true; + consoleLogStub.restore(); + process.env.NODE_ENV = "test"; + }); + }); + + describe("logObject", () => { + it("should log an object when not in production", () => { + const consoleLogStub = sinon.stub(console, "log"); + process.env.NODE_ENV = "development"; + const result = logObject("Test", { key: "value" }); + expect(consoleLogStub.calledOnce).to.be.true; + expect(consoleLogStub.calledWith("Test: ")).to.be.true; + consoleLogStub.restore(); + process.env.NODE_ENV = "test"; + }); + }); + + describe("logError", () => { + it("should log an error when not in production", () => { + const consoleErrorStub = sinon.stub(console, "error"); + process.env.NODE_ENV = "development"; + const error = new Error("Test error message"); + const result = logError(error); + expect(consoleErrorStub.calledOnce).to.be.true; + expect(consoleErrorStub.calledWith(error)).to.be.true; + consoleErrorStub.restore(); + process.env.NODE_ENV = "test"; + }); + + }); +}); From 676e80f0ed7c15dab29fa0100e80e5dd06fe2303 Mon Sep 17 00:00:00 2001 From: Benjamin Ssempala <86492979+BenjaminSsempala@users.noreply.github.com> Date: Thu, 21 Sep 2023 04:44:49 +0300 Subject: [PATCH 41/43] Fixing errors for incentives tests --- src/incentives/utils/test/ut_create-transaction.js | 14 +++++++------- src/incentives/utils/test/ut_generate-filter.js | 1 + 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/incentives/utils/test/ut_create-transaction.js b/src/incentives/utils/test/ut_create-transaction.js index ee9c90f403..25bd1e4a6b 100644 --- a/src/incentives/utils/test/ut_create-transaction.js +++ b/src/incentives/utils/test/ut_create-transaction.js @@ -4,7 +4,7 @@ const chai = require("chai"); const { expect } = chai; const httpStatus = require("http-status"); -const TransactionModel = require("@models/Transaction"); +const TransactionModel = require("@models/transaction"); const createTransaction = require("@utils/create-transaction"); const axios = require("axios"); @@ -509,7 +509,7 @@ describe("createTransaction", () => { }); // Execute the function - const response = await getTransactionDetails(request); + const response = await createTransaction.getTransactionDetails(request); // Assert the response expect(response).to.deep.equal(expectedResponse); @@ -537,7 +537,7 @@ describe("createTransaction", () => { .rejects(new Error("Network 
Error")); // Execute the function - const response = await getTransactionDetails(request); + const response = await createTransaction.getTransactionDetails(request); // Assert the response expect(response).to.deep.equal({ @@ -601,7 +601,7 @@ describe("createTransaction", () => { }); // Execute the function - const response = await loadDataBundle(request); + const response = await createTransaction.loadDataBundle(request); // Assert the response expect(response).to.deep.equal(expectedResponse); @@ -660,7 +660,7 @@ describe("createTransaction", () => { .rejects(new Error("Network Error")); // Execute the function - const response = await loadDataBundle(request); + const response = await createTransaction.loadDataBundle(request); // Assert the response expect(response).to.deep.equal({ @@ -721,7 +721,7 @@ describe("createTransaction", () => { }; // Execute the function - const response = await checkRemainingDataBundleBalance(request); + const response = await createTransaction.checkRemainingDataBundleBalance(request); // Assert the response expect(response).to.deep.equal(expectedResponse); @@ -745,7 +745,7 @@ describe("createTransaction", () => { const throwStub = chai.spy.on(errorStub, "throw"); // Execute the function - const response = await checkRemainingDataBundleBalance(request); + const response = await createTransaction.checkRemainingDataBundleBalance(request); // Assert the response expect(response).to.deep.equal({ diff --git a/src/incentives/utils/test/ut_generate-filter.js b/src/incentives/utils/test/ut_generate-filter.js index 3e0da9e6d5..871f37bb43 100644 --- a/src/incentives/utils/test/ut_generate-filter.js +++ b/src/incentives/utils/test/ut_generate-filter.js @@ -5,6 +5,7 @@ const { expect } = chai; const generateFilter = require("@utils/generate-filter"); const mongoose = require("mongoose"); const ObjectId = mongoose.Types.ObjectId; +const httpStatus = require("http-status"); describe("generateFilter", () => { describe("hosts", () => { From 2ed3a86ba00e7db93cb5c2eb00de25bf913af8c1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 21 Sep 2023 09:56:00 +0300 Subject: [PATCH 42/43] Update data mgt staging image tag to stage-d808bb92-1695279279 --- k8s/data-mgt/values-stage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/data-mgt/values-stage.yaml b/k8s/data-mgt/values-stage.yaml index 7b2e05ff8a..b700dd54ec 100644 --- a/k8s/data-mgt/values-stage.yaml +++ b/k8s/data-mgt/values-stage.yaml @@ -6,7 +6,7 @@ app: replicaCount: 2 image: repository: eu.gcr.io/airqo-250220/airqo-stage-data-mgt-api - tag: stage-e2c1d558-1691937865 + tag: stage-d808bb92-1695279279 nameOverride: '' fullnameOverride: '' podAnnotations: {} From c7fb17a053013a039eafde409e3421278c3adb55 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 21 Sep 2023 09:56:15 +0300 Subject: [PATCH 43/43] Update incentives staging image tag to stage-d808bb92-1695279279 --- k8s/incentives/values-stage.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/incentives/values-stage.yaml b/k8s/incentives/values-stage.yaml index 004bc0de37..6f124f67d1 100644 --- a/k8s/incentives/values-stage.yaml +++ b/k8s/incentives/values-stage.yaml @@ -6,7 +6,7 @@ app: replicaCount: 2 image: repository: eu.gcr.io/airqo-250220/airqo-stage-incentives-api - tag: stage-f7ce8287-1693130445 + tag: stage-d808bb92-1695279279 nameOverride: '' fullnameOverride: '' 
podAnnotations: {}
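
For reference, the per-device upsert pattern that patch 37 introduces in save_forecasts_to_mongo (replacing the earlier delete_many()/insert_many() calls) can be exercised in isolation roughly as shown below. This is an illustrative sketch, not part of the patch series: the connection URI, database name, device id, and sample readings are placeholder assumptions; only the pymongo update_one(..., upsert=True) call and the document shape (device_id, created_at, pm2_5, timestamp) come from the diff above.

    # Sketch only: standalone exercise of the upsert logic from patch 37.
    # URI, database name, and the sample document are assumed values.
    from datetime import datetime

    import pymongo as pm

    client = pm.MongoClient("mongodb://localhost:27017")  # placeholder URI
    db = client["airqo_test"]                             # placeholder database
    collection = db.hourly_forecasts

    doc = {
        "device_id": "dev1",  # placeholder device
        "created_at": datetime.now().isoformat(),
        "pm2_5": [10.0, 12.5],
        "timestamp": ["2023-09-20T00:00:00", "2023-09-20T01:00:00"],
    }

    # One upsert per device: update the existing forecast document if the
    # device already has one, otherwise insert a new document.
    collection.update_one(
        {"device_id": doc["device_id"]},
        {"$set": {k: doc[k] for k in ("pm2_5", "timestamp", "created_at")}},
        upsert=True,
    )

Compared with the previous delete-then-insert approach, this keeps one document per device and avoids wiping the whole collection when a single device's forecast fails to save.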