GB_experiment.py
import os
import sys
import pickle
import random
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import src.data.preprocessor as pre
import src.data.datasets as data
import src.helper as h
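#Usage: python GB_experiment.py <dataset_name> <cal_portion> <exp_seed>
#(e.g. "python GB_experiment.py cmapss 0.5 42"; these example values are illustrative, not prescribed)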
dataset_name = sys.argv[1]
cal_portion_str = sys.argv[2]
cal_portion = float(cal_portion_str)
exp_seed_str = sys.argv[3]
exp_seed = int(exp_seed_str)
#Seed Python's random module and NumPy (note: setting PYTHONHASHSEED at runtime does not affect hashing in the already-running interpreter)
os.environ['PYTHONHASHSEED']=str(exp_seed)
random.seed(exp_seed)
np.random.seed(exp_seed)
#--------------------------------------------------------------------------------------------------------------------------------
#CMAPSS columns: sensor measurements to remove and time/operational-setting columns to ignore
removable_cols = ["sm01", "sm05", "sm06", "sm10", "sm16", "sm18", "sm19"]
ignore_columns = ["time", "os1", "os2", "os3"]
alpha_array = np.array([0.1, 0.15, 0.2, 0.25]) #array of miscoverage rates
quantiles = np.concatenate((alpha_array, np.array([0.5]), 1-alpha_array))
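#Each (alpha, 1-alpha) pair supplies the lower/upper quantile models for CQR; the 0.5 model
#provides the median point prediction reported alongside the intervals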
#load, split, and preprocess the specified dataset
dataset = data.get_dataset(dataset_name, MinMaxScaler(feature_range=(-1, 1)))
split_dataset = pre.split_dataset(dataset, cal_size=cal_portion, random_state=exp_seed)
proc_dataset = pre.preprocess_split(split_dataset, scaler_factory=dataset["scaler_factory"], window_size=1, removable_cols=removable_cols, ignore_columns=ignore_columns)
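#After dropping the 7 removable sensor columns, 14 feature columns remain per timestep,
#matching the reshape((-1,14)) calls below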
#X and y
X_train = proc_dataset["train"]["X"].reshape((-1,14))
y_train = proc_dataset["train"]["y"].reshape(-1)
X_cal = proc_dataset["cal"]["X"].reshape((-1,14))
y_cal = proc_dataset["cal"]["y"].reshape(-1)
idx_cal = proc_dataset["cal"]["index"]
X_test, y_test, idx_test = h.reform_test_data(proc_dataset["test"])
X_test = X_test.reshape((-1,14))
y_test = y_test.reshape(-1)
#piecewise RUL definition
y_train[y_train>125] = 125
y_cal[y_cal>125] = 125
y_test[y_test>125] = 125
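#(capping targets at 125 cycles is the standard piecewise-linear RUL target for C-MAPSS)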
all_models = {}
for q in quantiles:
    gbr = HistGradientBoostingRegressor(loss="quantile", quantile=q, random_state=exp_seed)
    all_models["q %1.2f" % q] = gbr.fit(X_train, y_train)
    print("q", q)
    print("training RMSE of GB:", np.sqrt(mean_squared_error(y_train, gbr.predict(X_train))))
    print("calibration RMSE of GB:", np.sqrt(mean_squared_error(y_cal, gbr.predict(X_cal))))
    print("test RMSE of GB:", np.sqrt(mean_squared_error(y_test, gbr.predict(X_test))))
gbr_ls = HistGradientBoostingRegressor(loss="squared_error", random_state=exp_seed)
all_models["mse"] = gbr_ls.fit(X_train, y_train)
print("mse")
print("training RMSE of GB:", np.sqrt(mean_squared_error(y_train, gbr_ls.predict(X_train))))
print("calibration RMSE of GB:", np.sqrt(mean_squared_error(y_cal, gbr_ls.predict(X_cal))))
print("test RMSE of GB:", np.sqrt(mean_squared_error(y_test, gbr_ls.predict(X_test))))
#--------------------------------------------------------------------------------------------------------------------------------
#Difficulty estimates for the data points: a random forest (scikit-learn defaults) fit on the absolute residuals of the point model
y_hat_train = gbr_ls.predict(X_train)
y_hat_cal = gbr_ls.predict(X_cal)
y_hat_test = gbr_ls.predict(X_test)
res_train = np.abs(y_hat_train - y_train)
res_cal = np.abs(y_hat_cal - y_cal)
RF = RandomForestRegressor(random_state=exp_seed)
RF.fit(X_train, res_train)
print("Random forest details:")
print("average labels (training absolute residuals of GB):", res_train.mean())
print("training mean absolute error of RF:", mean_absolute_error(res_train, RF.predict(X_train)))
print("calibration mean absolute error of RF:", mean_absolute_error(res_cal, RF.predict(X_cal)))
#--------------------------------------------------------------------------------------------------------------------------------
rho = 0.99
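#rho: weighting parameter passed to the non-exchangeable conformal helper h.compute_quantiles_nex;
#its exact role (presumably a decay factor over calibration indices) is defined in src.helper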
sigma_cal = RF.predict(X_cal)
sigma_test = RF.predict(X_test)
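#sigma_cal/sigma_test: per-sample difficulty estimates used to scale the nonconformity scores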
y_hat_cal_CQR = {}
y_hat_test_CQR = {}
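#Per-level quantile predictions on the calibration and test sets, consumed by CQR below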
for q in quantiles:
    y_hat_cal_CQR["q %1.2f" % q] = all_models["q %1.2f" % q].predict(X_cal)
    y_hat_test_CQR["q %1.2f" % q] = all_models["q %1.2f" % q].predict(X_test)
scores = np.abs(y_cal - y_hat_cal)
scores_normalized = scores/sigma_cal
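#Split-conformal scores are the absolute calibration residuals; the normalized variant divides by
#the RF difficulty estimate so interval widths adapt to locally hard examples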
intervals_dic = {}
for alpha in alpha_array:
    q = h.compute_quantile(scores, alpha)
    q_array = h.compute_quantiles_nex(rho, scores.reshape((-1,1)), idx_test, idx_cal, alpha).reshape(-1)
    q_normalized = h.compute_quantile(scores_normalized, alpha)
    q_array_normalized = h.compute_quantiles_nex(rho, scores_normalized.reshape((-1,1)), idx_test, idx_cal, alpha).reshape(-1)
    scores_low = y_hat_cal_CQR["q %1.2f" % alpha] - y_cal
    scores_high = y_cal - y_hat_cal_CQR["q %1.2f" % (1-alpha)]
    scores_CQR = np.maximum(scores_low, scores_high)
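    #CQR nonconformity score: max(q_lo(x) - y, y - q_hi(x)), positive when y lies outside the quantile band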
    q_CQR = h.compute_quantile(scores_CQR, alpha)
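    #All intervals are clipped to the valid RUL range [0, 125]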
    intervals_dic_alpha = {
        "SCP": (np.maximum(0, y_hat_test - q), np.minimum(125, y_hat_test + q)),
        "nex-SCP": (np.maximum(0, y_hat_test - q_array), np.minimum(125, y_hat_test + q_array)),
        "SCP+NNM": (np.maximum(0, y_hat_test - q_normalized*sigma_test), np.minimum(125, y_hat_test + q_normalized*sigma_test)),
        "nex-SCP+NNM": (np.maximum(0, y_hat_test - q_array_normalized*sigma_test), np.minimum(125, y_hat_test + q_array_normalized*sigma_test)),
        "CQR": (np.maximum(0, y_hat_test_CQR["q %1.2f" % alpha] - q_CQR), np.minimum(125, y_hat_test_CQR["q %1.2f" % (1-alpha)] + q_CQR))
    }
    intervals_dic[alpha] = intervals_dic_alpha
results_dic = {
    "Ground truth RULs": y_test,
    "Single-point RUL predictions": y_hat_test,
    "Single-point RUL predictions CQR": y_hat_test_CQR["q %1.2f" % 0.5],
    "intervals": intervals_dic
}
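#Persist results under results_GB/<dataset_name>/cal_portion_<cal_portion_str>/seed_<exp_seed_str>/results.pkl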
os.makedirs(os.path.join("results_GB", dataset_name, "cal_portion_"+cal_portion_str, "seed_"+exp_seed_str), exist_ok=True)
with open(os.path.join("results_GB", dataset_name, "cal_portion_"+cal_portion_str, "seed_"+exp_seed_str, "results.pkl"), 'wb') as f:
    pickle.dump(results_dic, f)
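#A minimal sketch (not part of the original pipeline) of how the saved results could be read back
#to check empirical coverage, e.g. for the SCP intervals at alpha = 0.1 (fill in the path placeholders):
#   with open("results_GB/<dataset>/cal_portion_<p>/seed_<s>/results.pkl", "rb") as f:
#       res = pickle.load(f)
#   lo, hi = res["intervals"][0.1]["SCP"]
#   coverage = np.mean((res["Ground truth RULs"] >= lo) & (res["Ground truth RULs"] <= hi))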