-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
83 lines (69 loc) · 3.54 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
# Custom MAPE that skips zero targets to avoid division by zero.
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in %) over non-zero targets.

    Elements where ``y_true == 0`` are excluded, since the percentage error
    is undefined there. If *every* target is zero, NaN is returned explicitly
    (the original implicit behavior, but without a NumPy RuntimeWarning).

    Parameters
    ----------
    y_true, y_pred : array-like of numbers, same length.

    Returns
    -------
    float
        MAPE in percent, or NaN when no non-zero targets exist.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    non_zero = y_true != 0  # only evaluate where the denominator is valid
    if not non_zero.any():
        # All targets are zero: MAPE is undefined; avoid np.mean([]) warning.
        return float('nan')
    rel_err = np.abs((y_true[non_zero] - y_pred[non_zero]) / y_true[non_zero])
    return float(np.mean(rel_err) * 100)
# Step 1: load and preprocess the data.
data = pd.read_excel('data2.xlsx')  # path to the source Excel workbook
# Replace spreadsheet error markers such as '#DIV/0!' with 0.
data.replace({'#DIV/0!': 0}, inplace=True)
# Integer-encode 立地タイプ (location type), a categorical column.
label_encoder = LabelEncoder()
data['立地タイプ'] = label_encoder.fit_transform(data['立地タイプ'])
# Integer-encode 曜日 (day of the week) with its own encoder.
label_encoder_day_type = LabelEncoder()
data['曜日'] = label_encoder_day_type.fit_transform(data['曜日'])
# Step 2: standardize the numeric feature columns (zero mean, unit variance).
# NOTE(review): the scaler is fit on the FULL dataset before the train/test
# split below, which leaks test statistics into training — consider fitting
# on X_train only. Left as-is to preserve behavior.
scaler = StandardScaler()
numeric_cols = ['バスとの距離', '駅との距離', '人口_総数_300m以内', '男性割合', '15_64人口割合', '就業者_通学者割合', '就業者_通学者利用交通手段_自転車割合']
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])
# Persist the scaler and both label encoders so future predictions can
# apply the exact same transformations.
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(label_encoder, 'label_encoder.joblib')
joblib.dump(label_encoder_day_type, 'label_encoder_day_type.joblib')
# Define the feature matrix and the regression target.
X = data[['バスとの距離', '駅との距離', '立地タイプ', '曜日', '人口_総数_300m以内', '男性割合', '15_64人口割合', '就業者_通学者割合', '就業者_通学者利用交通手段_自転車割合']]
y = data['利用回数']  # target: usage count
# Step 3: hold out 20% of rows as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Step 4: define the neural-network model.
def create_mlp_model(input_dim):
    """Build and compile an MLP regressor.

    Architecture: three ReLU hidden layers (128 -> 64 -> 32) with 30%
    dropout after the first two, and a single linear output unit.
    Compiled with the Adam optimizer, MSE loss, and MAE as a metric.

    Parameters
    ----------
    input_dim : int
        Number of input features.

    Returns
    -------
    A compiled Keras ``Sequential`` model.
    """
    mlp = Sequential()
    mlp.add(Dense(128, input_dim=input_dim, activation='relu'))
    mlp.add(Dropout(0.3))
    mlp.add(Dense(64, activation='relu'))
    mlp.add(Dropout(0.3))
    mlp.add(Dense(32, activation='relu'))
    mlp.add(Dense(1))  # linear activation: regression output
    mlp.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return mlp
# Step 5: create and train the model.
model = create_mlp_model(X_train.shape[1])
# Stop when validation loss plateaus for 10 epochs; roll back to best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# 20% of the training rows are carved out as the validation set.
history = model.fit(X_train, y_train, validation_split=0.2, epochs=100, batch_size=32, callbacks=[early_stopping], verbose=1)
# Step 6: evaluate on the held-out test set.
y_pred = model.predict(X_test).flatten()  # flatten (n, 1) predictions to (n,)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)  # custom, skips zero targets
r2 = r2_score(y_test, y_pred)
print(f"神经网络模型的均方误差 (MSE): {mse}")
print(f"神经网络模型的根均方误差 (RMSE): {rmse}")
print(f"神经网络模型的平均绝对误差 (MAE): {mae}")
print(f"神经网络模型的平均绝对百分比误差 (MAPE): {mape:.2f}%")
print(f"神经网络模型的 R² 分数为: {r2:.2f}")
# Persist the trained network.
# NOTE(review): .h5 is Keras's legacy format; newer Keras prefers the
# native .keras format — confirm which the downstream loader expects.
model.save('best_mlp_model.h5')
print("模型已保存为 best_mlp_model.h5")