-
Notifications
You must be signed in to change notification settings - Fork 0
/
train-xgboost.py
166 lines (141 loc) · 6.93 KB
/
train-xgboost.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import joblib
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from matplotlib import font_manager
# 手动加载 MS Gothic 字体
font_path = "MS Gothic.ttf" # 确保路径是正确的
font_manager.fontManager.addfont(font_path)
plt.rcParams['font.family'] = 'MS Gothic'
# 自定义 MAPE 计算函数
def mean_absolute_percentage_error(y_true, y_pred):
y_true, y_pred = np.array(y_true), np.array(y_pred)
non_zero_indices = y_true != 0
return np.mean(np.abs((y_true[non_zero_indices] - y_pred[non_zero_indices]) / y_true[non_zero_indices])) * 100
# Step 1: 读取和处理数据
data = pd.read_excel('data3.xlsx')
data.replace({'#DIV/0!': 0}, inplace=True)
# 将 '利用開始日' 解析为日期格式
data['利用開始日'] = pd.to_datetime(data['利用開始日'], format='%Y-%m-%d')
# 提取日期相关特征:年、月、日、星期
data['年'] = data['利用開始日'].dt.year
data['月'] = data['利用開始日'].dt.month
data['日'] = data['利用開始日'].dt.day
data['星期'] = data['利用開始日'].dt.weekday # 星期一为0,星期日为6
# 创建周期性特征
data['月_sin'] = np.sin(2 * np.pi * data['月'] / 12)
data['月_cos'] = np.cos(2 * np.pi * data['月'] / 12)
data['星期_sin'] = np.sin(2 * np.pi * data['星期'] / 7)
data['星期_cos'] = np.cos(2 * np.pi * data['星期'] / 7)
# 其他类别编码
label_encoder_type = LabelEncoder()
label_encoder_day_type = LabelEncoder()
data['曜日'] = label_encoder_day_type.fit_transform(data['曜日'])
# 特征工程:创建交互特征
data['人口_就业交互'] = data['人口_総数_300m以内'] * data['就業者_通学者割合']
data['距离交互'] = data['バスとの距離'] * data['駅との距離']
# 添加新的数值特征
new_numeric_cols = ['ポート数_300mBuffer', 'ポート数_500mBuffer', 'ポート数_1000mBuffer',
'平均気温', '降水量の合計(mm)']
missing_cols = [col for col in new_numeric_cols if col not in data.columns]
if missing_cols:
raise ValueError(f"以下新增的数值特征在数据中未找到: {missing_cols}")
# 数值特征标准化
scaler = StandardScaler()
numeric_cols = ['バスとの距離', '駅との距離', '人口_総数_300m以内', '男性割合', '15_64人口割合', '就業者_通学者割合',
'就業者_通学者利用交通手段_自転車割合', '月_sin', '月_cos', '星期_sin', '星期_cos',
'人口_就业交互', '距离交互']
numeric_cols.extend(new_numeric_cols)
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])
# 保存 scaler 和 LabelEncoder 对象
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(label_encoder_type, 'label_encoder_type.joblib')
joblib.dump(label_encoder_day_type, 'label_encoder_day_type.joblib')
# 定义特征和目标
X = data[['バスとの距離', '駅との距離', '曜日',
'年', '月', '日', '月_sin', '月_cos', '星期_sin', '星期_cos',
'人口_総数_300m以内', '男性割合', '15_64人口割合', '就業者_通学者割合', '就業者_通学者利用交通手段_自転車割合',
'人口_就业交互', '距离交互'] + new_numeric_cols]
y = data['利用回数']
# 定义贝叶斯优化的搜索空间
space = {
'n_estimators': hp.choice('n_estimators', [100, 200, 300, 400]),
'max_depth': hp.quniform('max_depth', 3, 15, 1),
'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
'subsample': hp.uniform('subsample', 0.6, 1),
'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1),
'gamma': hp.uniform('gamma', 0, 0.5)
}
# 定义目标函数
def objective(params):
params['max_depth'] = int(params['max_depth'])
model = xgb.XGBRegressor(objective='reg:squarederror', tree_method='hist', device='cuda', **params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
return {'loss': mse, 'status': STATUS_OK}
# 使用贝叶斯优化和微调
num_trials = 3
best_params_list = []
for i in range(num_trials):
print(f"第 {i+1} 次模型训练")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)
best_params = {
'n_estimators': [100, 200, 300, 400][best['n_estimators']],
'max_depth': int(best['max_depth']),
'learning_rate': best['learning_rate'],
'subsample': best['subsample'],
'colsample_bytree': best['colsample_bytree'],
'gamma': best['gamma']
}
best_params_list.append(best_params)
best_model = xgb.XGBRegressor(objective='reg:squarederror', tree_method='hist', device='cuda', **best_params)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
# 添加特征重要性可视化
print(f"第 {i+1} 次模型特征重要性可视化")
# 获取特征的重要性分数
importance = best_model.get_booster().get_score(importance_type='gain')
# 过滤掉不想显示的特征
filtered_importance = {k: v for k, v in importance.items() if k not in ['月_sin', '月_cos', '星期_sin', '星期_cos']}
# 创建过滤后的图表
xgb.plot_importance(filtered_importance, importance_type='gain')
plt.show()
print(f"第 {i+1} 次模型已保存")
# 可以选择保存每次训练的模型,例如:
# joblib.dump(best_model, f'best_model_trial_{i+1}.joblib')
# 使用第三次的最佳参数进行微调
third_best_params = best_params_list[2] # 选择第三次的最佳参数
third_best_params['learning_rate'] *= 0.1 # 将学习率降低一半
third_best_params['n_estimators'] = 50000 # 增加迭代次数,进一步学习
print(f"使用第三次最佳参数 {third_best_params} 进行微调")
tuned_model = xgb.XGBRegressor(
objective='reg:squarederror',
tree_method='hist',
device='cuda',
early_stopping_rounds=500,
**third_best_params
)
evals = [(X_test, y_test)] # 评估集
tuned_model.fit(X_train, y_train, eval_set=evals, verbose=True)
tuned_y_pred = tuned_model.predict(X_test)
tuned_mse = mean_squared_error(y_test, tuned_y_pred)
tuned_rmse = np.sqrt(tuned_mse)
tuned_mae = mean_absolute_error(y_test, tuned_y_pred)
tuned_mape = mean_absolute_percentage_error(y_test, tuned_y_pred)
tuned_r2 = r2_score(y_test, tuned_y_pred)
print(f"微调后的 MSE: {tuned_mse}")
print(f"微调后的 RMSE: {tuned_rmse}")
print(f"微调后的 MAE: {tuned_mae}")
print(f"微调后的 MAPE: {tuned_mape:.2f}%")
print(f"微调后的 R²: {tuned_r2:.2f}")
# 保存最终微调后的模型
joblib.dump(tuned_model, 'final_tuned_xgb_model_third_trial.joblib')
print("微调后的模型已保存为 final_tuned_xgb_model_third_trial.joblib")