Commit 744b3c7

Add model checkpointing with pickle
1 parent 6130427

File tree

src/cnn.py
src/main.py

2 files changed (+101, -107 lines)

src/cnn.py

Lines changed: 60 additions & 39 deletions
@@ -2,6 +2,9 @@
 import random
 import matplotlib.pyplot as plt
 import time
+import cPickle as pickle
+import os
+import math
 
 
 def relu(a):
@@ -185,7 +188,6 @@ def J(X, y, params, keep_probs):
 def roll(params):
     rolled = np.array([])
     for param in params:
-        # print(str(param.shape) + " " + str(param.size))
         rolled = np.append(rolled, param.reshape(-1))
 
     return rolled
@@ -204,7 +206,6 @@ def params2tuple(params, total_filters):
 
 
 def unroll(rolled, params, total_filters):
-
     unrolled = [None] * len(params)
     start = 0
     for i in range(len(params)):
@@ -213,44 +214,36 @@ def unroll(rolled, params, total_filters):
 
     return params2tuple(unrolled, total_filters)
 
+
 def gradient_checking(params, grads, X, y, total_filters):
     r_params = roll(params)
     r_params = r_params.astype(np.float128)
-    # print(r_params)
+
     J_plus, J_minus = np.zeros((len(r_params))), np.zeros((len(r_params)))
     print("len of r_params = " + str(len(r_params)))
     for i in range(len(r_params)):
         original = r_params[i]
         r_params[i] = original + 1e-5
-        J_plus[i], _, _ = J(X, y, unroll(r_params, params, total_filters))
+        J_plus[i], _, _ = J(X, y, unroll(r_params, params, total_filters), [1.0, 1.0])
         r_params[i] = original - 1e-5
-        J_minus[i], _, _ = J(X, y, unroll(r_params, params, total_filters))
+        J_minus[i], _, _ = J(X, y, unroll(r_params, params, total_filters), [1.0, 1.0])
         r_params[i] = original
 
     d_theta = roll(grads)
     d_theta_approx = (J_plus - J_minus) / 2 / 1e-5
 
-    # print(d_theta)
-    # print(d_theta_approx)
-    diff = (np.abs(d_theta - d_theta_approx)) # / (np.abs(d_theta) + np.abs(d_theta_approx))
-    # print("diff = "); print(diff)
-
-    # print(d_theta - d_theta_approx)
     error = np.linalg.norm(d_theta - d_theta_approx) / (np.linalg.norm(d_theta) + np.linalg.norm(d_theta_approx))
     print("error = " + str(error))
 
     return
 
+
 def calc_accuracy(A, Y):
     predictions = A > 0.5
     return 1.0 * np.sum(Y * predictions + (1 - Y) * (1 - predictions)) / Y.size
 
-# X_train = seq_len x batch_size
-# y_train = 1 x batch_size
-def cnn(X_train, y_train, X_dev, y_dev, vocab_size, embedding_size, num_filters, filter_sizes, hidden_units, num_epochs, mini_batch_size, alpha, beta1, beta2, epsilon, keep_probs, print_cost=True, plot_cost=True):
 
-    np.random.seed(7)
-    random.seed(7)
+def random_initialization(vocab_size, embedding_size, num_filters, filter_sizes, hidden_units):
     total_filters = len(filter_sizes)
 
     E = np.random.rand(vocab_size, embedding_size) * 2 - 1
@@ -261,22 +254,40 @@ def cnn(X_train, y_train, X_dev, y_dev, vocab_size, embedding_size, num_filters,
     W2 = np.random.randn(1, hidden_units) * np.sqrt(1.0 / hidden_units)
     b2 = np.zeros((1, 1))
 
-    # gradient checking initialization
-    # E = np.random.rand(vocab_size, embedding_size) * 2 - 1
-    # F = [np.random.randn(filter_size, embedding_size, num_filters) * np.sqrt(6.0 / filter_size / embedding_size) for filter_size in filter_sizes]
-    # b = [np.random.rand(1, 1, num_filters) for i in range(total_filters)]
-    # W1 = np.random.randn(hidden_units, num_filters * total_filters) * np.sqrt(2.0 / num_filters * total_filters)
-    # b1 = np.random.rand(hidden_units, 1)
-    # W2 = np.random.randn(1, hidden_units) * np.sqrt(1.0 / hidden_units)
-    # b2 = np.random.rand(1, 1)
-
-    params = [E] + F + b + [W1, b1, W2, b2]
-    v_grads = [0] * len(params)
-    s_grads = [0] * len(params)
-
-    iteration = 0
-    costs = []
-    for epoch in range(num_epochs):
+    return [E] + F + b + [W1, b1, W2, b2]
+
+
+# X_train = seq_len x batch_size
+# y_train = 1 x batch_size
+def cnn(X_train, y_train, X_dev, y_dev, load_params_file, dump_dir, vocab_size, embedding_size,
+        num_filters, filter_sizes, hidden_units, num_epochs, mini_batch_size, alpha, beta1, beta2,
+        epsilon, keep_probs, plot_cost=True):
+
+    np.random.seed(7)
+    random.seed(7)
+    total_filters = len(filter_sizes)
+
+    if load_params_file is None:
+        params = random_initialization(vocab_size, embedding_size, num_filters, filter_sizes, hidden_units)
+        v_grads = [0] * len(params)
+        s_grads = [0] * len(params)
+        iteration = 0
+        start_epoch = 0
+        costs = []
+    else:
+        params, v_grads, s_grads, costs, iteration, start_epoch = pickle.load(open(load_params_file, "rb"))
+
+    hyperparams = {
+        "load_params_file": load_params_file, "dump_dir": dump_dir, "vocab_size": vocab_size,
+        "embedding_size": embedding_size, "num_filters": num_filters, "filter_sizes": filter_sizes,
+        "hidden_units": hidden_units, "num_epochs": num_epochs, "mini_batch_size": mini_batch_size,
+        "alpha": alpha, "beta1": beta1, "beta2": beta2, "epsilon": epsilon, "keep_probs": keep_probs,
+        "plot_cost": plot_cost, "iteration": iteration, "start_epoch": start_epoch
+    }
+    pickle.dump(hyperparams, open(os.path.join(dump_dir, "hyperparams.txt"), "wb"))
+
+    print("iteration = %s start_epoch = %s" % (iteration, start_epoch))
+    for epoch in range(start_epoch, num_epochs):
         mini_batches = random_split_batch(X_train, y_train, mini_batch_size)
 
         epoch_cost = 0
@@ -288,13 +299,17 @@ def cnn(X_train, y_train, X_dev, y_dev, vocab_size, embedding_size, num_filters,
             # break
 
             X, y = mini_batch
-            (E, F, b, W1, b1, W2, b2) = params2tuple(params, total_filters)
 
-            cost, A2, caches = J(X, y, (E, F, b, W1, b1, W2, b2), keep_probs)
+            cost, A2, caches = J(X, y, params2tuple(params, total_filters), keep_probs)
             conv_cache, regular_cache1, regular_cache2 = caches
 
             train_accuracy = calc_accuracy(A2, y)
-            print("iteration = " + str(iteration) + " cost = " + str(cost) + " train acc = " + str(train_accuracy))
+            logging_data = "iteration = %s cost = %s train_accuracy = %s" % (iteration, cost, train_accuracy)
+            print(logging_data)
+            pickle.dump(logging_data, open(os.path.join(dump_dir, "log.txt"), "ab"))
+
+            if math.isnan(cost):
+                return
 
             epoch_cost += cost
             epoch_accuracy += train_accuracy
@@ -319,14 +334,20 @@ def cnn(X_train, y_train, X_dev, y_dev, vocab_size, embedding_size, num_filters,
         epoch_cost /= len(mini_batches)
         epoch_accuracy /= len(mini_batches)
 
-        if print_cost: #and epoch % 100 == 0:
-            print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
-        if print_cost: #and epoch % 5 == 0:
-            costs.append(epoch_cost)
+        costs.append(epoch_cost)
 
         cost_dev, A2_dev, _ = J(X_dev, y_dev, params2tuple(params, total_filters), [1.0, 1.0])
         dev_accuracy = calc_accuracy(A2_dev, y_dev)
 
+        logging_data = "epoch = %s epoch_cost = %f alpha = %f epoch_accuracy = %f dev_accuracy = %f" % \
+                       (epoch, epoch_cost, alpha, epoch_accuracy, dev_accuracy)
+        pickle.dump(logging_data, open(os.path.join(dump_dir, "log.txt"), "ab"))
+
+        training_data = [params, v_grads, s_grads, costs, iteration, epoch+1]
+        pickle.dump(training_data, open(os.path.join(dump_dir, "training_" + str(epoch) + ".txt"), "wb"))
+
+        print("cost after epoch %i: %f" % (epoch, epoch_cost))
+        print("alpha = " + str(alpha))
         print("train epoch accuracy = " + str(epoch_accuracy))
         print("dev accuracy = " + str(dev_accuracy))

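For reference, a minimal sketch of how the checkpoints this loop writes could be read back, assuming Python 2 (the code imports cPickle) and the hypothetical ../runs/55 paths taken from the commented-out resume example in main.py below:

    import cPickle as pickle
    import os

    # Hypothetical run directory and epoch; substitute a real checkpoint path.
    run_dir = "../runs/55"
    checkpoint = os.path.join(run_dir, "training_73.txt")

    # training_<epoch>.txt holds the six-element list dumped at the end of each epoch.
    with open(checkpoint, "rb") as f:
        params, v_grads, s_grads, costs, iteration, start_epoch = pickle.load(f)
    print("would resume at iteration %s, epoch %s" % (iteration, start_epoch))

    # log.txt is a stream of pickled strings appended in "ab" mode, so reading
    # it back means calling pickle.load repeatedly until EOFError.
    with open(os.path.join(run_dir, "log.txt"), "rb") as f:
        try:
            while True:
                print(pickle.load(f))
        except EOFError:
            pass

Passing such a checkpoint as load_params_file restores the Adam moment estimates (v_grads, s_grads) along with the parameters, so resumed training continues with the same optimizer state rather than cold-starting the moving averages.
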
src/main.py

Lines changed: 41 additions & 68 deletions
@@ -3,16 +3,15 @@
 import numpy as np
 import random
 import cnn
-
+import cPickle as pickle
+import os
 
 x_text, y = load_data_and_labels("../data/rt-polaritydata/rt-polarity.pos", "../data/rt-polaritydata/rt-polarity.neg")
 y = np.dot(y, [[0], [1]])
 
 # Build vocabulary
 max_document_length = max([len(x.split(" ")) for x in x_text])
-# max_document_length = 3
-min_frequency = 0
-vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length, min_frequency)
+vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
 x = np.array(list(vocab_processor.fit_transform(x_text)))
 
 # Randomly shuffle data
@@ -24,6 +23,10 @@
 # Split train/dev/test set
 train_size = 8000
 dev_size = (x.shape[0] - train_size) / 2
+
+# dev_size = x.shape[0] / 10
+# train_size = x.shape[0] - dev_size
+
 x_train, x_dev, x_test = x_shuffled[:train_size], x_shuffled[train_size:train_size+dev_size], x_shuffled[train_size+dev_size:]
 y_train, y_dev, y_test = y_shuffled[:train_size], y_shuffled[train_size:train_size+dev_size], y_shuffled[train_size+dev_size:]
 
@@ -34,14 +37,14 @@
 embedding_size = 64
 num_filters = 32
 filter_sizes = [3, 4, 5]
-hidden_units = 20
+hidden_units = 100
 num_epochs = 100
-mini_batch_size = 128
-alpha = 0.01
+mini_batch_size = 64
+alpha = 0.009
 beta1 = 0.9
 beta2 = 0.999
 epsilon = 1e-8
-keep_probs = [1.0, 0.5]
+keep_probs = [0.5, 0.5]
 
 # vocab_size = len(vocab_processor.vocabulary_)
 # embedding_size = 2
@@ -60,63 +63,33 @@
 # print(x_train)
 # print(y_train)
 
-print("vocab_size = " + str(vocab_size))
-
-params = cnn.cnn(x_train, y_train, x_dev, y_dev, vocab_size, embedding_size, num_filters, filter_sizes, hidden_units, num_epochs, mini_batch_size, alpha, beta1, beta2, epsilon, keep_probs)
-
-exit(0)
-
-print(max_document_length)
-print(x_text[0])
-print(x[0])
-
-idx = [0, 2, 0, 4]
-X = [[0, 2], [0, 4], [1, 1]]
-E = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16], [17, 18, 19, 20]])
-
-EX = E[X, :]
-print(EX.shape)
-EX = EX.reshape((6, 4))
-print(EX)
-print(EX.shape)
-
-seq_len = 4
-filter_size = 3
-C = np.arange(seq_len * filter_size).reshape(seq_len, filter_size)
-print(C)
-print(C[1:4, 0:2])
-
-S = np.array([np.trace(C, x) for x in range(0, -(seq_len - filter_size) - 1, -1)])
-print(S)
-
-list = [
-    [[1, 2], [3, 4]],
-    [[5, 6]],
-    [[7, 8], [9, 10], [11, 12]]
-]
-
-m = np.concatenate(list)
-print(m)
-
-a = np.array([1, 2, 5, 5, 2, 1])
-m = np.amax(a)
-indices = np.nonzero(a == m)[0]
-print(random.choice(indices))
-print(random.choice(indices))
-print(random.choice(indices))
-
-X = np.array([[0, 1], [2, 2], [0, 3], [1, 1]])
-E = np.array([[1, 2], [5, 6], [9, 10], [13, 14]])
-F = [
-    np.array([[-1, -1], [-1, 0], [0, -1]]),
-    np.array([[0, 0], [1, 1], [0, 0]]),
-    np.array([[1, 1], [2, 2]]),
-    np.array([[10, 10], [1000, 1000]])
-]
-b = [1, 0, 2, 3]
-
-# features, cache = cnn.conv_forward_prop(X, E, F, b)
-# dA = np.arange(features.shape[0] * features.shape[1]).reshape(features.shape)
-# cnn.conv_backward_prop(dA, cache)
-
-
+last_run = pickle.load(open("../runs/last_run.txt", "rb"))
+pickle.dump(last_run + 1, open("../runs/last_run.txt", "wb"))
+
+dump_dir = os.path.join("../runs", str(last_run))
+os.makedirs(dump_dir)
+
+load_params_file = None
+# load_params_dir = "../runs/55"
+# load_params_file = os.path.join(load_params_dir, "training_73.txt")
+# hyperparams = pickle.load(open(os.path.join(load_params_dir, "hyperparams.txt"), "rb"))
+# vocab_size = hyperparams["vocab_size"]
+# embedding_size = hyperparams["embedding_size"]
+# num_filters = hyperparams["num_filters"]
+# filter_sizes = hyperparams["filter_sizes"]
+# hidden_units = hyperparams["hidden_units"]
+# num_epochs = hyperparams["num_epochs"]
+# mini_batch_size = hyperparams["mini_batch_size"]
+# alpha = hyperparams["alpha"]
+# beta1 = hyperparams["beta1"]
+# beta2 = hyperparams["beta2"]
+# epsilon = hyperparams["epsilon"]
+# keep_probs = hyperparams["keep_probs"]
+
+# override some of the hyperparams
+# keep_probs = [0.5, 0.5]
+# alpha = 0.009
+
+params = cnn.cnn(x_train, y_train, x_dev, y_dev, load_params_file, dump_dir, vocab_size, embedding_size,
+                 num_filters, filter_sizes, hidden_units, num_epochs, mini_batch_size, alpha, beta1, beta2,
+                 epsilon, keep_probs)
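One caveat: main.py reads ../runs/last_run.txt before incrementing it, so the counter file must exist before the first run. A one-time bootstrap along these lines (the seed value 0 is an assumption) would make the first run dump into ../runs/0:

    import cPickle as pickle
    import os

    # Seed the run counter; main.py loads it, writes back the incremented
    # value, and names this run's dump directory after the old value.
    if not os.path.isdir("../runs"):
        os.makedirs("../runs")
    with open("../runs/last_run.txt", "wb") as f:
        pickle.dump(0, f)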
