@@ -179,6 +179,18 @@ def J(X, y, params, keep_probs):
179
179
A1 , regular_cache1 = regular_forward_prop (A0 , W1 , b1 , relu , keep_probs [1 ])
180
180
A2 , regular_cache2 = regular_forward_prop (A1 , W2 , b2 , sigmoid , 1.0 )
181
181
182
+ # print("a0 mean = " + str(np.mean(A0)))
183
+ # print("a0 var = " + str(np.var(A0)))
184
+ # print("a0 n var = " + str(1.0 / np.var(A0)))
185
+ # print("a0 max = " + str(np.max(A0)))
186
+ # print("a0 min = " + str(np.min(A0)))
187
+ #
188
+ # print("a1 mean = " + str(np.mean(A1)))
189
+ # print("a1 var = " + str(np.var(A1)))
190
+ # print("a1 n var = " + str(1.0 / np.var(A1)))
191
+ # print("a1 max = " + str(np.max(A1)))
192
+ # print("a1 min = " + str(np.min(A1)))
193
+
182
194
cost = np .sum ((- y * np .log (A2 ) - (1 - y ) * np .log (1 - A2 )), axis = 1 ) / batch_size
183
195
184
196
caches = conv_cache , regular_cache1 , regular_cache2
@@ -249,7 +261,7 @@ def random_initialization(vocab_size, embedding_size, num_filters, filter_sizes,
249
261
E = np .random .rand (vocab_size , embedding_size ) * 2 - 1
250
262
F = [np .random .randn (filter_size , embedding_size , num_filters ) * np .sqrt (6.0 / filter_size / embedding_size ) for filter_size in filter_sizes ]
251
263
b = [np .zeros ((1 , 1 , num_filters )) for i in range (total_filters )]
252
- W1 = np .random .randn (hidden_units , num_filters * total_filters ) * np .sqrt (2.0 / num_filters * total_filters )
264
+ W1 = np .random .randn (hidden_units , num_filters * total_filters ) * np .sqrt (2.0 / num_filters / total_filters )
253
265
b1 = np .zeros ((hidden_units , 1 ))
254
266
W2 = np .random .randn (1 , hidden_units ) * np .sqrt (1.0 / hidden_units )
255
267
b2 = np .zeros ((1 , 1 ))
@@ -295,8 +307,10 @@ def cnn(X_train, y_train, X_dev, y_dev, load_params_file, dump_dir, vocab_size,
295
307
for mini_batch in mini_batches :
296
308
iteration += 1
297
309
298
- # if iteration % 5 == 0:
299
- # break
310
+ # print("mean = " + str([np.mean(x) for x in params]))
311
+ # print("var = " + str([np.var(x) for x in params]))
312
+ # print("max = " + str([np.max(x) for x in params]))
313
+ # print("min = " + str([np.min(x) for x in params]))
300
314
301
315
X , y = mini_batch
302
316
@@ -321,7 +335,10 @@ def cnn(X_train, y_train, X_dev, y_dev, load_params_file, dump_dir, vocab_size,
321
335
322
336
grads = [dE ] + dF + db + [dW1 , db1 , dW2 , db2 ]
323
337
324
- # gradient_checking(params, grads, X, y, total_filters)
338
+ # print("mean g = " + str([np.mean(x) for x in grads]))
339
+ # print("var g = " + str([np.var(x) for x in grads]))
340
+ # print("max g = " + str([np.max(x) for x in grads]))
341
+ # print("min g = " + str([np.min(x) for x in grads]))
325
342
326
343
v_grads = [v * beta1 + g * (1 - beta1 ) for v , g in zip (v_grads , grads )]
327
344
s_grads = [s * beta2 + g * g * (1 - beta2 ) for s , g in zip (s_grads , grads )]
0 commit comments