-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
114 lines (81 loc) · 3.15 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
f1_score,
roc_auc_score,
)
from tqdm import tqdm
import mlflow
import mlflow.sklearn
# Where MLflow saves runs: the remote tracking server below.
# To keep runs on the local filesystem instead, pass a file:// URI such as
# "file:///Users/ahmed.besbes/projects/mlflow/mlruns".
TRACKING_URI = (
    "http://ec2-35-180-45-108.eu-west-3.compute.amazonaws.com:5000/"
)
mlflow.set_tracking_uri(TRACKING_URI)
# Debugging aid: print("artifact store : ", mlflow.get_artifact_uri())
# load data
# NOTE: the path is relative, so it is resolved against the current working
# directory of the process, not against this script's location.
data = pd.read_csv("./../data/aug_train.csv")

# Keep the label as a one-column frame, then remove it (together with the
# row identifier) from the feature table in place.
targets = data.loc[:, ["target"]]
data.drop(columns=["enrollee_id", "target"], inplace=True)
# process features
## partition the columns by dtype: float64 / int64 are treated as numerical,
## everything else as categorical
categorical_features = []
numerical_features = []
for column in data.columns:
    dtype = str(data[column].dtype)
    if dtype in ["float64", "int64"]:
        numerical_features.append(column)
    else:
        categorical_features.append(column)

## fill in missing categorical variables and label encode them
for categorical_feature in categorical_features:
    # Assign the filled column back instead of calling
    # data[col].fillna(..., inplace=True): that form mutates a temporary
    # Series, which pandas 2.x warns about and which has no effect on `data`
    # when Copy-on-Write is enabled (NaN would then reach the encoder).
    data[categorical_feature] = data[categorical_feature].fillna("missing")
    # A fresh encoder per column: each column gets its own integer codes.
    le = LabelEncoder()
    data[categorical_feature] = le.fit_transform(data[categorical_feature])

print("features processed")
# split train / test
# 30 % of the rows are held out for testing; the split is seeded and
# stratified on the target column.
feature_matrix = data.values
label_column = targets.values  # one-column 2-D array
x_train, x_test, y_train, y_test = train_test_split(
    feature_matrix,
    label_column.ravel(),  # flattened to 1-D for the estimator and metrics
    test_size=0.3,
    random_state=2021,
    stratify=label_column,
)
# Optional first command-line argument overrides alpha (default 0.5).
# sys.argv[0] is the script path, so the user-supplied value is at index 1;
# it is converted so alpha is a float whether or not it was passed.
# A non-numeric argument raises ValueError.
# NOTE(review): alpha is not referenced anywhere else in the visible script.
alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5

# Activate the experiment that the runs below are logged under.
# set_experiment() returns an Experiment entity in current MLflow releases,
# whereas start_run(experiment_id=...) expects the experiment's ID, so the ID
# is extracted here. Releases that return None yield experiment_id=None,
# which makes start_run() use the experiment activated by this call.
# To create the experiment explicitly instead:
# experiment_id = mlflow.create_experiment("training experiment")
experiment = mlflow.set_experiment("training experiment")
experiment_id = getattr(experiment, "experiment_id", None)
# Hyperparameter grid explored by the search loop.
# Number of trees: 100, 125, ..., 475 (the stop value is exclusive).
n_estimators_range = np.arange(start=100, stop=500, step=25)
# Maximum tree depth: the odd values 1, 3, ..., 23.
max_depth_range = np.arange(start=1, stop=25, step=2)
# Candidate values for the forest's max_features setting.
max_features_range = [
    "sqrt",
    None,
    "log2",
]
# Exhaustive grid search: every (n_estimators, max_depth, max_features)
# combination is trained, evaluated on the held-out split and recorded as
# its own MLflow run (parameters, metrics and the fitted model).
for n_estimators in tqdm(n_estimators_range):
    for max_depth in tqdm(max_depth_range, leave=False):
        for max_features in tqdm(max_features_range, leave=False):
            with mlflow.start_run(experiment_id=experiment_id):
                model = RandomForestClassifier(
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                    max_features=max_features,
                    n_jobs=3,
                )
                model.fit(x_train, y_train)

                # Hard class labels feed the threshold-based metrics.
                y_pred = model.predict(x_test)
                # ROC AUC measures ranking quality, so it is scored on the
                # predicted probability of the positive class (column 1 of
                # predict_proba) rather than on 0/1 labels, which would
                # reduce the ROC curve to a single operating point.
                y_score = model.predict_proba(x_test)[:, 1]

                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred)
                auc = roc_auc_score(y_test, y_score)

                mlflow.log_param("n_estimators", n_estimators)
                mlflow.log_param("max_depth", max_depth)
                mlflow.log_param("max_features", max_features)

                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_metric("precision", precision)
                mlflow.log_metric("recall", recall)
                mlflow.log_metric("f1", f1)
                mlflow.log_metric("auc", auc)

                # Store the fitted estimator as a run artifact named "model".
                mlflow.sklearn.log_model(model, "model")