-
Notifications
You must be signed in to change notification settings - Fork 1
/
training_ml_models.py
442 lines (326 loc) · 15.2 KB
/
training_ml_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
#| # Example 1: Introduction to regression
#|
#| This notebook is designed to introduce the basic concepts of training models
#| using the numpy and scipy libraries alone.
#------------------------------------------------------------------------------
#| Import the key libraries and functions.
import numpy as np
#| Load the data.
data_x, data_y = np.loadtxt('example_1_data.csv').T
#| Plot the data
import matplotlib.pyplot as plt
plt.plot(data_x, data_y, '.');
#| Define our model function
def f(x, theta):
i = np.arange(len(theta))
x = np.array(x)
return (theta * x[..., None]**i).sum(axis=-1)
#|- **Q1**: What does the `f` define?
#|- **Q2**: What does `None` slicing do?
#|- **Q3**: Why choose `...` rather than `:`?
#|- **Q4**: Why is `axis=-1` essential to the above?
#|- **(Optional exercise)**: Are you able to extend the above to `theta`?
#------------------------------------------------------------------------------
#| Now fit the model to the data by minimising a loss function
from scipy.optimize import minimize
def loss(theta):
return ((f(data_x, theta) - data_y)**2).sum()
theta0 = np.zeros(4)
sol = minimize(loss, theta0)
#|- **Q5**:
#| Plot the solution
theta = sol.x
plt.plot(data_x, data_y, '.')
x = np.linspace(-1,3,1001)
plt.plot(x, f(x, theta));
#| Consider models of increasing complexity
plt.plot(data_x, data_y, '.')
for n in [0, 1, 2, 3, 4]:
sol = minimize(loss, np.zeros(n+1))
theta = sol.x
plt.plot(x, f(x, theta));
#| **Q6**: Where is the quadratic solution? (hint, try adding a legend)
#------------------------------------------------------------------------------
#| **Q7:** What's goes wrong with the following?
plt.plot(data_x, data_y, '.')
sol = minimize(loss, np.zeros(10))
theta = sol.x
plt.plot(x, f(x, theta));
#| **Q8:** Plot the loss as a function of `n`
#| **Q9:** How could we go about choosing n? Programme something if you have time
#------------------------------------------------------------------------------
#| This dataset was generated with the code below.
#|
#| Try modifying the data and see how these observations change
np.random.seed(0)
x = np.random.uniform(-1,3,50)
y = 1 + (x-1)**3 -(x-1)
y += np.random.randn(len(y))
np.savetxt('example_1_data.csv', np.array([x, y]).T)
#| # BREAK
#|
#| ***
#------------------------------------------------------------------------------
#| # Example 2: Machine Learning with scikit learn
#|
#| https://www.kaggle.com/datasets/emmanuelfwerr/london-weather-data
#|
#| Can we predict the rain in London?
#------------------------------------------------------------------------------
#| ## Exploring the data and showcasing pandas
#| First import the data raw as downloaded from kaggle
import pandas as pd
data = pd.read_csv('london_weather.csv')
data.head()
#| We can also get some broad info, which is useful to show us how many non-null values there are
data.info()
#| The pandas describe method is useful for getting a quick overview of the stastistics of the data
data.describe()
#| Note that the counts don't agree -- this is due to missing data, mostly in the snow_depth
#| Let's drop them.
data.dropna(inplace=True)
#| We could also have also used an imputer to fill it with the median
#| **(Optional exercise)**
#|
#|`from sklearn.impute import SimpleImputer`
#------------------------------------------------------------------------------
#| The pandas hist methoda is a useful tool for showing an overview of the shape of the data (useful for eyeballing distributions & scales
data.hist();
#| Now look into correlations
corr_matrix = data.corr()
corr_matrix['sunshine'].sort_values(ascending=False)
#| Unsurprisingly sunshine correlates with temperatures and pressures, and
#| anticorrelates with rain and cloud cover
#------------------------------------------------------------------------------
#| We can also plot the full scatter matrix, which is much more involved,
#| showing the scatter plots of pairwise parameters. Note that down the
#| diagonal it shows the histograms as before
pd.plotting.scatter_matrix(data);
#| It's also relatively easy to define new variables in pandas, e.g. an integer detailing the month. Here we use the `to_datetime` functionality to convert to a column of datetimes, and the pandas `.dt.month` operation to extract the month.
data['month'] = pd.to_datetime(data.date, format='%Y%m%d').dt.month
corr_matrix = data.corr()
corr_matrix['precipitation'].sort_values(ascending=False)
#| Surprising -- quite weak correlations... but!
pd.plotting.scatter_matrix(data);
#| It's a non-linear relationship with month. Let's drop it for now
data.drop(columns='month', inplace=True)
#| ## Showcasing scikit-learn
#| Let's start by trying to predict the amount of temperature
y = data['sunshine']
X = data.drop(columns=['sunshine'])
#| Split into training and testing data using the scikit learn function
#|
#| **(Optional Exercise)**: Look into using sklearn.model_selection.StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#| Now we need to standardise the data by zero meaning and scaling to unit variance. Other options include `MaxMinScaler`, as well as custom transformations. Look into `sklearn.compose.CoumnTransformer`.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
pipeline = StandardScaler()
pipeline.fit(X_train)
X_trans = pipeline.transform(X_train)
#| Could also run `X_trans = pipeline.fit_transform(X_train)` to 'fit' the pipeline and and scale the data simultaneously
#|
#| Now let's train a model!
#|
#| Start with something embarrassingly simple -- linear regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_trans, y_train)
#| Done! Now let's test the predictions on the _training_ set (i.e. how well it has done in-sample):
from sklearn.metrics import mean_squared_error
import numpy as np
y_pred = lin_reg.predict(X_trans)
np.sqrt(mean_squared_error(y_train, y_pred))
#| Well that's only OK when compared to the spread in y itself:
y_train.std()
#| A scatter plot of predictions is also not great
import matplotlib.pyplot as plt
plt.scatter(y_train, y_pred);
#| We're clearly doing something though!
#| Let's try something different -- a decision tree
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_trans, y_train)
y_pred = tree_reg.predict(X_trans)
np.sqrt(mean_squared_error(y_train, y_pred))
#| Well that seem fishy...
plt.scatter(y_train, y_pred);
#| This is almost certainly an example of overfitting. At this point we would be tempted to look at the testing data, but it is better to hold that back until later.
#| Let's try some cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, X_trans, y_train, cv=10, scoring="neg_mean_squared_error")
np.sqrt(-scores)
#| So out-of-sample apparently worse than the linear regression -- let's check for sure
scores = cross_val_score(lin_reg, X_trans, y_train, cv=10, scoring="neg_mean_squared_error")
np.sqrt(-scores)
#| Let's try a `RandomForestRegressor` -- tthis is effectively a stack of decision trees
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(X_trans, y_train)
y_pred = forest_reg.predict(X_trans)
np.sqrt(mean_squared_error(y_train, y_pred))
#| Seems reasonable.
plt.scatter(y_train, y_pred);
#| Out of sample?
scores = cross_val_score(forest_reg, X_trans, y_train, cv=10, scoring="neg_mean_squared_error")
np.sqrt(-scores)
#| So slightly better than the linear regression, and clearly overfit in sample
#------------------------------------------------------------------------------
#| To tune the hyperparameters we can run a grid search over the hyperparameters for the `RandomForestRegressor`
from sklearn.model_selection import GridSearchCV
param_grid = [
{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
scoring='neg_mean_squared_error',
return_train_score=True)
grid_search.fit(X_trans, y_train)
#| We can get the best params as an attribute
grid_search.best_params_
#| and the best model
grid_search.best_estimator_
#| As well as the total grid of results
for mean_score, params in zip(grid_search.cv_results_["mean_test_score"], grid_search.cv_results_["params"]):
print(np.sqrt(-mean_score), params)
#| Also useful to look at which features are the most important
for feature, importance in zip(X_train.columns, grid_search.best_estimator_.feature_importances_):
print(feature, importance)
#| global radiation and cloud cover seem to be the most important.
#| Now try it on the test set
final_model = grid_search.best_estimator_
X_trans = pipeline.transform(X_test)
y_pred = final_model.predict(X_trans)
plt.scatter(y_test, y_pred);
#| Let's compare to lineara regression
plt.scatter(y_test, lin_reg.predict(X_trans));
plt.scatter(y_test, tree_reg.predict(X_trans));
#| Interestingly it seems that the main reason the tree does better is dealing with those outliers
#| Exercises:
#| - **Q10** How do the above change if we try to predict rainfall? temperature?
#| - **Q11** Are any other of the out-of-the-box methods any better?
#| - **Q12** Do these results change if you add in any new features e.g. month/year?
#| - **Q13** Try this analysis out using other datasets from kaggle?
#| - **Q14** How do the results change if we remove those outliers?
#| # BREAK
#|
#| ***
#------------------------------------------------------------------------------
#| # Example 3: Neural network training
#|
#| ## Keras TensorFlow sequential API
#| Let's specify an anatomy
from tensorflow import keras
model = keras.models.Sequential()
model.add(keras.layers.Input(X.shape[-1]))
model.add(keras.layers.Dense(30, activation="relu"))
model.add(keras.layers.Dense(30, activation="relu"))
model.add(keras.layers.Dense(30, activation="relu"))
model.add(keras.layers.Dense(1, activation="softplus"))
#| Alternatively we can do it in one go
#|```
#|model = keras.models.Sequential([
#| keras.layers.Input(X.shape[-1]),
#| keras.layers.Dense(30, activation="relu"),
#| keras.layers.Dense(30, activation="relu"),
#| keras.layers.Dense(1, activation="softplus")
#| ])
#|```
#| Inspect the model
model.summary()
model.summary()
#| Specify the model training procedures with 'compilation'
model.compile(loss="mean_squared_error", optimizer="sgd", metrics=["accuracy"])
#| Split into validation and testing using sklearn's `test_train_split` again
X_trans_, X_valid, y_train_, y_valid = train_test_split(pipeline.transform(X_train), y_train, test_size=0.2, random_state=43)
#| Train!
history = model.fit(X_trans_, y_train_, epochs=50, validation_data=(X_valid, y_valid))
#| Plot the history
pd.DataFrame(history.history).plot()
#| Make predictions
plt.scatter(y_test, lin_reg.predict(pipeline.transform(X_test)));
plt.scatter(y_test, model.predict(pipeline.transform(X_test)));
#| ## Keras TensorFlow functional API
#| Now let's look at the functional API, implemented for a non-sequential 'wide and deep' network
input = keras.layers.Input(X.shape[-1])
hidden1 = keras.layers.Dense(30, activation="relu")(input)
hidden2 = keras.layers.Dense(30, activation="relu")(hidden1)
concat = keras.layers.Concatenate()([input, hidden2])
output = keras.layers.Dense(1, activation="softplus")(concat)
model = keras.models.Model(inputs=[input], outputs=[output])
#| Now let's fit it
model.compile(loss="mean_squared_error", optimizer="sgd", metrics=["accuracy"])
history = model.fit(X_trans_, y_train_, epochs=50, validation_data=(X_valid, y_valid))
pd.DataFrame(history.history).plot()
#| Make predictions
plt.scatter(y_test, lin_reg.predict(pipeline.transform(X_test)));
plt.scatter(y_test, model.predict(pipeline.transform(X_test)));
#| ## Keras TensorFlow Subclssing API
#| This is useful for dynamic models. Here we implement the same 'wide and deep' network as above, but in principle this could change dynamically according to input
class WideAndDeepModel(keras.models.Model):
def __init__(self, units=30, activation="relu", **kwargs):
super().__init__(**kwargs)
self.hidden1 = keras.layers.Dense(units, activation=activation)
self.hidden2 = keras.layers.Dense(units, activation=activation)
self.output_ = keras.layers.Dense(1, activation="softplus")
def call(self, input):
hidden1 = self.hidden1(input)
hidden2 = self.hidden2(hidden1)
concat = keras.layers.Concatenate()([input, hidden2])
output = self.output_(concat)
return output
#| We define `model_` rather than `model` purely because we wish to return to the previous model earlier (subclassed models are more restrictive than functional or sequential models).
model_ = WideAndDeepModel()
model_.compile(loss="mean_squared_error", optimizer="sgd", metrics=["accuracy"])
history = model_.fit(X_trans_, y_train_, epochs=50, validation_data=(X_valid, y_valid))
#| We can inspect any weights we choose
weights, biases = model.get_layer('dense_4').get_weights()
weights
#| Saving and restoring
model.save("my_keras_model.h5")
#| Tasks
#| - **Q15** Make a more systematic comparison of all the methods losses
#| - **Q15** Explore changing the neural network architectures/training mechanisms
#| ## PyTorch
#| Define a sequential model for now
import torch
model = torch.nn.Sequential(
torch.nn.Linear(X.shape[-1],30),
torch.nn.ReLU(),
torch.nn.Linear(30,30),
torch.nn.ReLU(),
torch.nn.Linear(30,1),
torch.nn.Softplus(),
)
#| Define a mean square error loss function
loss_fn = torch.nn.MSELoss()
#| Define a stochastic gradient descent optimiser with learning rate 1e-3
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
#| (slightly verbose) mechanism for passing training data into a format that pytorch can understand.
train_dataloader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.tensor(X_trans_).float(), torch.tensor(y_train_.values).float()), batch_size=32)
valid_dataloader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.tensor(X_valid).float(), torch.tensor(y_valid.values).float()), batch_size=32)
#| The meat of the code -- train! (will discuss)
for _ in range(50):
sum_train_loss = 0
for x_batch, y_batch in train_dataloader:
pred = model(x_batch)[:, 0]
loss = loss_fn(pred, y_batch)
loss.backward()
optimizer.step()
optimizer.zero_grad()
sum_train_loss += loss.item()
sum_valid_loss = 0
with torch.no_grad():
for x_batch, y_batch in valid_dataloader:
pred = model(x_batch)[:, 0]
loss = loss_fn(pred, y_batch)
sum_valid_loss += loss.item()
print("Loss: (%.2f, %.2f)" % (sum_train_loss/len(train_dataloader),
sum_valid_loss/len(valid_dataloader)))
#| Plot the predictions
y_pred = model(torch.from_numpy(pipeline.transform(X_test)).float()).detach().numpy()
plt.scatter(y_test, y_pred);
#| Tasks
#| - **Q16** Compare and contrast the pytorch approach vs the keras approach. Which do you prefer?