import random
import matplotlib.pyplot as plt
import time
+import cPickle as pickle
+import os
+import math


def relu(a):
@@ -185,7 +188,6 @@ def J(X, y, params, keep_probs):
def roll(params):
    rolled = np.array([])
    for param in params:
-        # print(str(param.shape) + " " + str(param.size))
        rolled = np.append(rolled, param.reshape(-1))

    return rolled
@@ -204,7 +206,6 @@ def params2tuple(params, total_filters):


def unroll(rolled, params, total_filters):
-
    unrolled = [None] * len(params)
    start = 0
    for i in range(len(params)):
@@ -213,44 +214,36 @@ def unroll(rolled, params, total_filters):

    return params2tuple(unrolled, total_filters)

+
def gradient_checking(params, grads, X, y, total_filters):
    r_params = roll(params)
    r_params = r_params.astype(np.float128)
-    # print(r_params)
+
    J_plus, J_minus = np.zeros((len(r_params))), np.zeros((len(r_params)))
    print("len of r_params = " + str(len(r_params)))
    for i in range(len(r_params)):
        original = r_params[i]
        r_params[i] = original + 1e-5
-        J_plus[i], _, _ = J(X, y, unroll(r_params, params, total_filters))
+        J_plus[i], _, _ = J(X, y, unroll(r_params, params, total_filters), [1.0, 1.0])
        r_params[i] = original - 1e-5
-        J_minus[i], _, _ = J(X, y, unroll(r_params, params, total_filters))
+        J_minus[i], _, _ = J(X, y, unroll(r_params, params, total_filters), [1.0, 1.0])
        r_params[i] = original

    d_theta = roll(grads)
    d_theta_approx = (J_plus - J_minus) / 2 / 1e-5

-    # print(d_theta)
-    # print(d_theta_approx)
-    diff = (np.abs(d_theta - d_theta_approx))  # / (np.abs(d_theta) + np.abs(d_theta_approx))
-    # print("diff = "); print(diff)
-
-    # print(d_theta - d_theta_approx)
    error = np.linalg.norm(d_theta - d_theta_approx) / (np.linalg.norm(d_theta) + np.linalg.norm(d_theta_approx))
    print("error = " + str(error))

    return

+
def calc_accuracy(A, Y):
    predictions = A > 0.5
    return 1.0 * np.sum(Y * predictions + (1 - Y) * (1 - predictions)) / Y.size

-# X_train = seq_len x batch_size
-# y_train = 1 x batch_size
-def cnn(X_train, y_train, X_dev, y_dev, vocab_size, embedding_size, num_filters, filter_sizes, hidden_units, num_epochs, mini_batch_size, alpha, beta1, beta2, epsilon, keep_probs, print_cost=True, plot_cost=True):

-    np.random.seed(7)
-    random.seed(7)
+def random_initialization(vocab_size, embedding_size, num_filters, filter_sizes, hidden_units):
    total_filters = len(filter_sizes)

    E = np.random.rand(vocab_size, embedding_size) * 2 - 1
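In the gradient_checking hunk above, J is now called with keep_probs = [1.0, 1.0], presumably so that dropout is effectively disabled and the centered finite-difference estimate is compared against a deterministic cost. A minimal standalone sketch of that check (the function and variable names below are illustrative, not from this repository):

import numpy as np

def numeric_gradient(cost_fn, theta, eps=1e-5):
    # Centered difference: perturb one parameter at a time and estimate
    # dJ/dtheta_i = (J(theta_i + eps) - J(theta_i - eps)) / (2 * eps).
    grad = np.zeros_like(theta)
    for i in range(len(theta)):
        original = theta[i]
        theta[i] = original + eps
        j_plus = cost_fn(theta)
        theta[i] = original - eps
        j_minus = cost_fn(theta)
        theta[i] = original
        grad[i] = (j_plus - j_minus) / (2 * eps)
    return grad

def relative_error(d_theta, d_theta_approx):
    # Relative error between analytic and numeric gradients; values around
    # 1e-7 or smaller usually indicate a correct backward pass.
    return np.linalg.norm(d_theta - d_theta_approx) / (
        np.linalg.norm(d_theta) + np.linalg.norm(d_theta_approx))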
@@ -261,22 +254,40 @@ def cnn(X_train, y_train, X_dev, y_dev, vocab_size, embedding_size, num_filters,
    W2 = np.random.randn(1, hidden_units) * np.sqrt(1.0 / hidden_units)
    b2 = np.zeros((1, 1))

-    # gradient checking initialization
-    # E = np.random.rand(vocab_size, embedding_size) * 2 - 1
-    # F = [np.random.randn(filter_size, embedding_size, num_filters) * np.sqrt(6.0 / filter_size / embedding_size) for filter_size in filter_sizes]
-    # b = [np.random.rand(1, 1, num_filters) for i in range(total_filters)]
-    # W1 = np.random.randn(hidden_units, num_filters * total_filters) * np.sqrt(2.0 / num_filters * total_filters)
-    # b1 = np.random.rand(hidden_units, 1)
-    # W2 = np.random.randn(1, hidden_units) * np.sqrt(1.0 / hidden_units)
-    # b2 = np.random.rand(1, 1)
-
-    params = [E] + F + b + [W1, b1, W2, b2]
-    v_grads = [0] * len(params)
-    s_grads = [0] * len(params)
-
-    iteration = 0
-    costs = []
-    for epoch in range(num_epochs):
+    return [E] + F + b + [W1, b1, W2, b2]
+
+
+# X_train = seq_len x batch_size
+# y_train = 1 x batch_size
+def cnn(X_train, y_train, X_dev, y_dev, load_params_file, dump_dir, vocab_size, embedding_size,
+        num_filters, filter_sizes, hidden_units, num_epochs, mini_batch_size, alpha, beta1, beta2,
+        epsilon, keep_probs, plot_cost=True):
+
+    np.random.seed(7)
+    random.seed(7)
+    total_filters = len(filter_sizes)
+
+    if load_params_file is None:
+        params = random_initialization(vocab_size, embedding_size, num_filters, filter_sizes, hidden_units)
+        v_grads = [0] * len(params)
+        s_grads = [0] * len(params)
+        iteration = 0
+        start_epoch = 0
+        costs = []
+    else:
+        params, v_grads, s_grads, costs, iteration, start_epoch = pickle.load(open(load_params_file, "rb"))
+
+    hyperparams = {
+        "load_params_file": load_params_file, "dump_dir": dump_dir, "vocab_size": vocab_size,
+        "embedding_size": embedding_size, "num_filters": num_filters, "filter_sizes": filter_sizes,
+        "hidden_units": hidden_units, "num_epochs": num_epochs, "mini_batch_size": mini_batch_size,
+        "alpha": alpha, "beta1": beta1, "beta2": beta2, "epsilon": epsilon, "keep_probs": keep_probs,
+        "plot_cost": plot_cost, "iteration": iteration, "start_epoch": start_epoch
+    }
+    pickle.dump(hyperparams, open(os.path.join(dump_dir, "hyperparams.txt"), "wb"))
+
+    print("iteration = %s start_epoch = %s" % (iteration, start_epoch))
+    for epoch in range(start_epoch, num_epochs):
        mini_batches = random_split_batch(X_train, y_train, mini_batch_size)

        epoch_cost = 0
@@ -288,13 +299,17 @@ def cnn(X_train, y_train, X_dev, y_dev, vocab_size, embedding_size, num_filters,
            # break

            X, y = mini_batch
-            (E, F, b, W1, b1, W2, b2) = params2tuple(params, total_filters)

-            cost, A2, caches = J(X, y, (E, F, b, W1, b1, W2, b2), keep_probs)
+            cost, A2, caches = J(X, y, params2tuple(params, total_filters), keep_probs)
            conv_cache, regular_cache1, regular_cache2 = caches

            train_accuracy = calc_accuracy(A2, y)
-            print("iteration = " + str(iteration) + " cost = " + str(cost) + " train acc = " + str(train_accuracy))
+            logging_data = "iteration = %s cost = %s train_accuracy = %s" % (iteration, cost, train_accuracy)
+            print(logging_data)
+            pickle.dump(logging_data, open(os.path.join(dump_dir, "log.txt"), "ab"))
+
+            if math.isnan(cost):
+                return

            epoch_cost += cost
            epoch_accuracy += train_accuracy
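Note that log.txt is written by appending raw pickles (file mode "ab") rather than text lines, so reading it back requires calling pickle.load repeatedly until the end of the file. A minimal sketch of such a reader, assuming that layout (read_log is a hypothetical helper, not part of the repository):

import cPickle as pickle

def read_log(path):
    # Collect every pickled entry that was appended to the log file.
    entries = []
    with open(path, "rb") as f:
        while True:
            try:
                entries.append(pickle.load(f))
            except EOFError:
                return entries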
@@ -319,14 +334,20 @@ def cnn(X_train, y_train, X_dev, y_dev, vocab_size, embedding_size, num_filters,
        epoch_cost /= len(mini_batches)
        epoch_accuracy /= len(mini_batches)

-        if print_cost:  # and epoch % 100 == 0:
-            print("Cost after epoch %i: %f" % (epoch, epoch_cost))
-        if print_cost:  # and epoch % 5 == 0:
-            costs.append(epoch_cost)
+        costs.append(epoch_cost)

        cost_dev, A2_dev, _ = J(X_dev, y_dev, params2tuple(params, total_filters), [1.0, 1.0])
        dev_accuracy = calc_accuracy(A2_dev, y_dev)

+        logging_data = "epoch = %s epoch_cost = %f alpha = %f epoch_accuracy = %f dev_accuracy = %f" % \
+            (epoch, epoch_cost, alpha, epoch_accuracy, dev_accuracy)
+        pickle.dump(logging_data, open(os.path.join(dump_dir, "log.txt"), "ab"))
+
+        training_data = [params, v_grads, s_grads, costs, iteration, epoch + 1]
+        pickle.dump(training_data, open(os.path.join(dump_dir, "training_" + str(epoch) + ".txt"), "wb"))
+
+        print("cost after epoch %i: %f" % (epoch, epoch_cost))
+        print("alpha = " + str(alpha))
        print("train epoch accuracy = " + str(epoch_accuracy))
        print("dev accuracy = " + str(dev_accuracy))

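Each epoch the list [params, v_grads, s_grads, costs, iteration, epoch + 1] is dumped to dump_dir/training_<epoch>.txt, and the else branch of cnn reloads exactly that tuple, so a later run can resume from a checkpoint by pointing load_params_file at one of those files. A hedged usage sketch; every literal value below is a placeholder and must match the run that produced the checkpoint:

# Hypothetical resume call; the data arrays and every hyperparameter value
# here are placeholders, not values taken from this repository.
cnn(X_train, y_train, X_dev, y_dev,
    load_params_file="dumps/training_4.txt",  # checkpoint written after epoch 4
    dump_dir="dumps",
    vocab_size=20000, embedding_size=128,
    num_filters=100, filter_sizes=[3, 4, 5], hidden_units=50,
    num_epochs=20, mini_batch_size=64,
    alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
    keep_probs=[0.8, 0.5])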