From a24cb3e406db84123589ab10b1f609d9117f6bf6 Mon Sep 17 00:00:00 2001
From: Norman Heckscher
Date: Tue, 3 Jan 2017 07:46:21 +1000
Subject: [PATCH 01/11] Dropouts

Implemented Dropouts using Tensorflow's Dropout Wrapper around rnn_cell and
dropout function for embedded_input.

http://www.cs.toronto.edu/%7Ersalakhu/papers/srivastava14a.pdf
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/rnn_cell.py#L708
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/nn_ops.py#L995

Signed-off-by: Norman Heckscher
---
 model.py  | 15 +++++++++++----
 sample.py |  2 +-
 train.py  |  2 ++
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/model.py b/model.py
index d1ea5b46..d52021dc 100644
--- a/model.py
+++ b/model.py
@@ -5,9 +5,9 @@
 import numpy as np


 class Model():
-    def __init__(self, args, infer=False):
+    def __init__(self, args, training=True):
         self.args = args
-        if infer:
+        if not training:
             args.batch_size = 1
             args.seq_length = 1
@@ -22,6 +22,9 @@ def __init__(self, args, infer=False):

         cell = cell_fn(args.rnn_size)

+        if training and args.keep_prob < 1:
+            cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=args.keep_prob)
+
         self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

         self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
@@ -33,7 +36,11 @@ def __init__(self, args, infer=False):
         softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
         with tf.device("/cpu:0"):
             embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
-            inputs = tf.split(1, args.seq_length, tf.nn.embedding_lookup(embedding, self.input_data))
+            inputs = tf.nn.embedding_lookup(embedding, self.input_data)
+            if training and args.keep_prob < 1:
+                inputs = tf.nn.dropout(inputs, args.keep_prob)
+
+            inputs = tf.split(1, args.seq_length, inputs)
             inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

         def loop(prev, _):
@@ -41,7 +48,7 @@ def loop(prev, _):
             prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
             return tf.nn.embedding_lookup(embedding, prev_symbol)

-        outputs, last_state = seq2seq.rnn_decoder(inputs, self.initial_state, cell, loop_function=loop if infer else None, scope='rnnlm')
+        outputs, last_state = seq2seq.rnn_decoder(inputs, self.initial_state, cell, loop_function=loop if not training else None, scope='rnnlm')
         output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
         self.logits = tf.matmul(output, softmax_w) + softmax_b
         self.probs = tf.nn.softmax(self.logits)
diff --git a/sample.py b/sample.py
index bef448e8..849975ba 100644
--- a/sample.py
+++ b/sample.py
@@ -29,7 +29,7 @@ def sample(args):
         saved_args = cPickle.load(f)
     with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'rb') as f:
         words, vocab = cPickle.load(f)
-    model = Model(saved_args, True)
+    model = Model(saved_args, training=False)
     with tf.Session() as sess:
         tf.global_variables_initializer().run()
         saver = tf.train.Saver(tf.global_variables())
diff --git a/train.py b/train.py
index 74f8026b..73d55fc8 100644
--- a/train.py
+++ b/train.py
@@ -36,6 +36,8 @@ def main():
                         help='learning rate')
     parser.add_argument('--decay_rate', type=float, default=0.97,
                         help='decay rate for rmsprop')
+    parser.add_argument('--keep_prob', type=float, default=1.0,
+                        help = 'probability of keeping weights in the dropout layer')
     parser.add_argument('--init_from', type=str, default=None,
                         help="""continue training from saved model at this path. Path must contain files saved by previous training process:
                             'config.pkl' : configuration;
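
Note: the following is a minimal, standalone sketch of the dropout wiring the patch above introduces -- dropout on the wrapped cell's outputs and on the embedded inputs, applied only while training. The layer sizes and flag values are assumptions, not repository code; the rnn_cell import mirrors the one model.py already uses.

    # Illustrative sketch only -- made-up sizes and flag values, not repository code.
    import tensorflow as tf
    from tensorflow.python.ops import rnn_cell

    training, keep_prob = True, 0.8            # assumed values of the new --keep_prob flag
    cell = rnn_cell.BasicLSTMCell(128)
    if training and keep_prob < 1:
        # dropout applied to the recurrent cell's outputs
        cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)
    cell = rnn_cell.MultiRNNCell([cell] * 2)

    input_data = tf.placeholder(tf.int32, [50, 25])           # batch_size x seq_length
    embedding = tf.get_variable("embedding", [10000, 128])    # vocab_size x rnn_size
    inputs = tf.nn.embedding_lookup(embedding, input_data)
    if training and keep_prob < 1:
        # dropout applied to the embedded inputs as well
        inputs = tf.nn.dropout(inputs, keep_prob)

At sampling time (sample.py now passes training=False) both branches are skipped, so nothing is dropped.
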
From 5dd991fe2a52d55ced852e52376c5612ed140371 Mon Sep 17 00:00:00 2001
From: Norman Heckscher
Date: Tue, 3 Jan 2017 16:08:28 +1000
Subject: [PATCH 02/11] decay

Signed-off-by: Norman Heckscher
---
 model.py |  7 +++++--
 train.py | 10 +++++-----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/model.py b/model.py
index d52021dc..4322fdfe 100644
--- a/model.py
+++ b/model.py
@@ -59,12 +59,15 @@ def loop(prev, _):
         self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
         tf.summary.scalar("cost", self.cost)
         self.final_state = last_state
-        self.lr = tf.Variable(0.0, trainable=False)
+        self.global_step = tf.Variable(0, name='global_step', trainable=False)
+        self.lr = tf.train.exponential_decay(args.learning_rate, self.global_step,
+                                             args.decay_step, args.decay_rate)
+        tf.summary.scalar("learning_rate", self.lr)
         tvars = tf.trainable_variables()
         grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                 args.grad_clip)
         optimizer = tf.train.AdamOptimizer(self.lr)
-        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
+        self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step)

     def sample(self, sess, words, vocab, num=200, prime='first all', sampling_type=1):
         state = sess.run(self.cell.zero_state(1, tf.float32))
diff --git a/train.py b/train.py
index 73d55fc8..c349c391 100644
--- a/train.py
+++ b/train.py
@@ -52,6 +52,7 @@ def main():
 def train(args):
     data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
     args.vocab_size = data_loader.vocab_size
+    args.decay_step = data_loader.num_batches

     # check compatibility if training is continued from previously saved model
     if args.init_from is not None:
@@ -95,7 +96,6 @@ def train(args):
         if args.init_from is not None:
             saver.restore(sess, ckpt.model_checkpoint_path)
         for e in range(args.num_epochs):
-            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
             data_loader.reset_batch_pointer()
             if args.init_from is None:
                 assign_op = batch_pointer.assign(0)
@@ -108,16 +108,16 @@ def train(args):
                 start = time.time()
                 x, y = data_loader.next_batch()
                 feed = {model.input_data: x, model.targets: y, model.initial_state: state}
-                summary, train_loss, state, _ = sess.run([merged, model.cost, model.final_state, model.train_op], feed)
+                summary, lr, train_loss, state, _ = sess.run([merged, model.lr, model.cost, model.final_state, model.train_op], feed)
                 train_writer.add_summary(summary, e * data_loader.num_batches + b)
                 assign_op = batch_pointer.assign(data_loader.pointer)
                 sess.run(assign_op)
                 end = time.time()
                 if (e * data_loader.num_batches + b) % args.batch_size == 0:
-                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
-                        .format(e * data_loader.num_batches + b,
+                    print("{}/{} (epoch {}), lr = {:.6f}, train_loss = {:.3f}, time/batch = {:.3f}" \
+                        .format(e * data_loader.num_batches + b,
                             args.num_epochs * data_loader.num_batches,
-                            e, train_loss, end - start))
+                            e, lr, train_loss, end - start))
                 if (e * data_loader.num_batches + b) % args.save_every == 0 \
                         or (e==args.num_epochs-1 and b == data_loader.num_batches-1): # save for the last result
                     checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
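
Note: a standalone sketch of the schedule the patch above switches to. tf.train.exponential_decay computes learning_rate * decay_rate ** (global_step / decay_step), and passing global_step to apply_gradients advances the step automatically; in the patch, decay_step is set to the number of batches per epoch in train.py. The loss, clip norm, and hyperparameter values below are placeholders, not repository code.

    # Illustrative sketch only -- placeholder loss and hyperparameters.
    import tensorflow as tf

    learning_rate, decay_rate, decay_step = 0.002, 0.97, 1000   # decay_step ~= batches per epoch
    global_step = tf.Variable(0, name='global_step', trainable=False)
    lr = tf.train.exponential_decay(learning_rate, global_step, decay_step, decay_rate)
    tf.summary.scalar("learning_rate", lr)

    weights = tf.get_variable("weights", [10], initializer=tf.constant_initializer(1.0))
    loss = tf.reduce_sum(tf.square(weights))                     # stand-in for the real cost
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), 5.0)
    optimizer = tf.train.AdamOptimizer(lr)
    # passing global_step makes apply_gradients advance it, which in turn advances the decay
    train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)
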
From 0406b11209d8a8fad4af4072bb6f44b397bddd31 Mon Sep 17 00:00:00 2001
From: Norman Heckscher
Date: Wed, 4 Jan 2017 14:26:37 +1000
Subject: [PATCH 03/11] add batch time to tensorboard

Signed-off-by: Norman Heckscher
---
 model.py |  3 +++
 train.py | 10 +++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/model.py b/model.py
index 6ef7064a..129e1c0c 100644
--- a/model.py
+++ b/model.py
@@ -31,6 +31,9 @@ def __init__(self, args, training=True):
         self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
         self.initial_state = cell.zero_state(args.batch_size, tf.float32)
         self.batch_pointer = tf.Variable(0, name="batch_pointer", trainable=False)
+        self.epoch_pointer = tf.Variable(0, name="epoch_pointer", trainable=False)
+        self.batch_time = tf.Variable(0.0, name="batch_time", trainable=False)
+        tf.summary.scalar("time/batch", self.batch_time)

         with tf.variable_scope('rnnlm'):
             softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
diff --git a/train.py b/train.py
index 6a48fcb7..7c04b2a3 100644
--- a/train.py
+++ b/train.py
@@ -103,18 +103,22 @@ def train(args):
             if args.init_from is not None:
                 data_loader.pointer = model.batch_pointer.eval()
                 args.init_from = None
+            speed = 0
             for b in range(data_loader.pointer, data_loader.num_batches):
                 start = time.time()
                 x, y = data_loader.next_batch()
-                feed = {model.input_data: x, model.targets: y, model.initial_state: state, model.batch_pointer: data_loader.pointer}
-                summary, train_loss, lr, state, _ = sess.run([merged, model.cost, model.lr, model.final_state, model.train_op], feed)
+                feed = {model.input_data: x, model.targets: y, model.initial_state: state,
+                        model.batch_pointer: data_loader.pointer, model.batch_time: speed}
+                summary, train_loss, lr, state, _ = sess.run([merged, model.cost, model.lr,
+                                                              model.final_state, model.train_op], feed)
                 train_writer.add_summary(summary, e * data_loader.num_batches + b)
                 end = time.time()
+                speed = end - start
                 if (e * data_loader.num_batches + b) % args.batch_size == 0:
                     print("{}/{} (epoch {}), lr = {:.6f}, train_loss = {:.3f}, time/batch = {:.3f}" \
                         .format(e * data_loader.num_batches + b,
                             args.num_epochs * data_loader.num_batches,
-                            e, lr, train_loss, end - start))
+                            e, lr, train_loss, speed))
                 if (e * data_loader.num_batches + b) % args.save_every == 0 \
                         or (e==args.num_epochs-1 and b == data_loader.num_batches-1): # save for the last result
                     checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')

From eccfe2870e084947d29142d5cf289cd86d7401a2 Mon Sep 17 00:00:00 2001
From: Norman Heckscher
Date: Wed, 4 Jan 2017 14:46:26 +1000
Subject: [PATCH 04/11] Add time/batch to TensorBoard

Signed-off-by: Norman Heckscher
---
 model.py |  2 ++
 train.py | 10 +++++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/model.py b/model.py
index 2d1b6190..0beea379 100644
--- a/model.py
+++ b/model.py
@@ -31,6 +31,8 @@ def __init__(self, args, training=True):
         self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
         self.initial_state = cell.zero_state(args.batch_size, tf.float32)
         self.batch_pointer = tf.Variable(0, name="batch_pointer", trainable=False)
+        self.batch_time = tf.Variable(0.0, name="batch_time", trainable=False)
+        tf.summary.scalar("time_batch", self.batch_time)

         with tf.variable_scope('rnnlm'):
             softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
diff --git a/train.py b/train.py
index 27a22ed5..5ff208d6 100644
--- a/train.py
+++ b/train.py
@@ -103,18 +103,22 @@ def train(args):
             if args.init_from is not None:
                 data_loader.pointer = model.batch_pointer.eval()
                 args.init_from = None
+            speed = 0
             for b in range(data_loader.pointer, data_loader.num_batches):
                 start = time.time()
                 x, y = data_loader.next_batch()
-                feed = {model.input_data: x, model.targets: y, model.initial_state: state, model.batch_pointer: data_loader.pointer}
-                summary, train_loss, state, _ = sess.run([merged, model.cost, model.final_state, model.train_op], feed)
+                feed = {model.input_data: x, model.targets: y, model.initial_state: state,
+                        model.batch_pointer: data_loader.pointer, model.batch_time: speed}
+                summary, train_loss, state, _ = sess.run([merged, model.cost, model.final_state,
+                                                          model.train_op], feed)
                 train_writer.add_summary(summary, e * data_loader.num_batches + b)
                 end = time.time()
+                speed = end - start
                 if (e * data_loader.num_batches + b) % args.batch_size == 0:
                     print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                         .format(e * data_loader.num_batches + b,
                             args.num_epochs * data_loader.num_batches,
-                            e, train_loss, end - start))
+                            e, train_loss, speed))
                 if (e * data_loader.num_batches + b) % args.save_every == 0 \
                         or (e==args.num_epochs-1 and b == data_loader.num_batches-1): # save for the last result
                     checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
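
Note: both patches above log time per batch by feeding the measured Python float into a non-trainable batch_time variable watched by a scalar summary, which is the same mechanism train.py uses for the batch pointer. The sketch below isolates that idea; the loop bounds are made up and the real training fetches are elided.

    # Illustrative sketch only -- real training fetches elided.
    import time
    import tensorflow as tf

    batch_time = tf.Variable(0.0, name="batch_time", trainable=False)
    tf.summary.scalar("time_batch", batch_time)
    merged = tf.summary.merge_all()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        train_writer = tf.summary.FileWriter('logs', sess.graph)
        speed = 0
        for step in range(3):
            start = time.time()
            # feeding the variable overrides its stored value for this run() call only
            summary = sess.run(merged, {batch_time: speed})
            train_writer.add_summary(summary, step)
            speed = time.time() - start
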
From 4f45c4ab4abdf972840f8b047bdf4a1eae82523e Mon Sep 17 00:00:00 2001
From: Norman Heckscher
Date: Wed, 4 Jan 2017 21:13:16 +1000
Subject: [PATCH 05/11] fixes epoc save. batch save. dropout.

Signed-off-by: Norman Heckscher
---
 model.py |  4 +++-
 train.py | 10 ++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/model.py b/model.py
index 0beea379..c8d7df4f 100644
--- a/model.py
+++ b/model.py
@@ -30,7 +30,9 @@ def __init__(self, args, training=True):
         self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
         self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
         self.initial_state = cell.zero_state(args.batch_size, tf.float32)
-        self.batch_pointer = tf.Variable(0, name="batch_pointer", trainable=False)
+        self.batch_pointer = tf.Variable(0, name="batch_pointer", trainable=False, dtype=tf.int32)
+        self.inc_batch_pointer_op = tf.assign(self.batch_pointer, self.batch_pointer + 1)
+        self.epoch_pointer = tf.Variable(0, name="epoch_pointer", trainable=False)
         self.batch_time = tf.Variable(0.0, name="batch_time", trainable=False)
         tf.summary.scalar("time_batch", self.batch_time)

diff --git a/train.py b/train.py
index 5ff208d6..9b58909b 100644
--- a/train.py
+++ b/train.py
@@ -93,12 +93,14 @@ def train(args):
         # restore model
         if args.init_from is not None:
             saver.restore(sess, ckpt.model_checkpoint_path)
-        for e in range(args.num_epochs):
+        for e in range(model.epoch_pointer.eval(), args.num_epochs):
             sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
             data_loader.reset_batch_pointer()
             if args.init_from is None:
                 assign_op = model.batch_pointer.assign(0)
                 sess.run(assign_op)
+                assign_op = model.epoch_pointer.assign(e)
+                sess.run(assign_op)
             state = sess.run(model.initial_state)
             if args.init_from is not None:
                 data_loader.pointer = model.batch_pointer.eval()
@@ -108,9 +110,9 @@ def train(args):
                 start = time.time()
                 x, y = data_loader.next_batch()
                 feed = {model.input_data: x, model.targets: y, model.initial_state: state,
-                        model.batch_pointer: data_loader.pointer, model.batch_time: speed}
-                summary, train_loss, state, _ = sess.run([merged, model.cost, model.final_state,
-                                                          model.train_op], feed)
+                        model.batch_time: speed}
+                summary, train_loss, state, _, _ = sess.run([merged, model.cost, model.final_state,
+                                                             model.train_op, model.inc_batch_pointer_op], feed)
                 train_writer.add_summary(summary, e * data_loader.num_batches + b)
                 end = time.time()
                 speed = end - start
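
Note: the point of the patch above is that batch_pointer and epoch_pointer live in the graph as non-trainable variables, so an ordinary checkpoint records how far training got, and the batch pointer is advanced by an in-graph assign fetched together with train_op. The sketch below isolates that idea with made-up loop bounds and no real training step.

    # Illustrative sketch only -- made-up loop bounds, training step elided.
    import tensorflow as tf

    batch_pointer = tf.Variable(0, name="batch_pointer", trainable=False, dtype=tf.int32)
    epoch_pointer = tf.Variable(0, name="epoch_pointer", trainable=False)
    # incrementing inside the graph keeps the saved value in step with completed batches
    inc_batch_pointer_op = tf.assign(batch_pointer, batch_pointer + 1)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for e in range(epoch_pointer.eval(session=sess), 2):   # resumes at the saved epoch
            sess.run(epoch_pointer.assign(e))
            sess.run(batch_pointer.assign(0))                   # a fresh epoch starts at batch 0
            for b in range(batch_pointer.eval(session=sess), 5):
                sess.run(inc_batch_pointer_op)                   # normally fetched with train_op
            # a tf.train.Saver checkpoint taken here would capture both pointers
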
From ee226354a7acbfd3cea271ad1cb697fd5bfd4b79 Mon Sep 17 00:00:00 2001
From: Norman Heckscher
Date: Wed, 4 Jan 2017 21:36:56 +1000
Subject: [PATCH 06/11] Readability

Signed-off-by: Norman Heckscher
---
 train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/train.py b/train.py
index 9b58909b..501c1a5e 100644
--- a/train.py
+++ b/train.py
@@ -96,16 +96,16 @@ def train(args):
         for e in range(model.epoch_pointer.eval(), args.num_epochs):
             sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
             data_loader.reset_batch_pointer()
+            state = sess.run(model.initial_state)
+            speed = 0
             if args.init_from is None:
                 assign_op = model.batch_pointer.assign(0)
                 sess.run(assign_op)
                 assign_op = model.epoch_pointer.assign(e)
                 sess.run(assign_op)
-            state = sess.run(model.initial_state)
             if args.init_from is not None:
                 data_loader.pointer = model.batch_pointer.eval()
                 args.init_from = None
-            speed = 0
             for b in range(data_loader.pointer, data_loader.num_batches):
                 start = time.time()
                 x, y = data_loader.next_batch()

From 20d373ad8a18e6d6d2642d385f9939a175289000 Mon Sep 17 00:00:00 2001
From: Norman Heckscher
Date: Wed, 4 Jan 2017 22:25:00 +1000
Subject: [PATCH 07/11] clean code

Signed-off-by: Norman Heckscher
---
 train.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/train.py b/train.py
index 501c1a5e..a7b99c3c 100644
--- a/train.py
+++ b/train.py
@@ -114,8 +114,7 @@ def train(args):
                 summary, train_loss, state, _, _ = sess.run([merged, model.cost, model.final_state,
                                                              model.train_op, model.inc_batch_pointer_op], feed)
                 train_writer.add_summary(summary, e * data_loader.num_batches + b)
-                end = time.time()
-                speed = end - start
+                speed = time.time() - start
                 if (e * data_loader.num_batches + b) % args.batch_size == 0:
                     print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                         .format(e * data_loader.num_batches + b,

From 57b3fa1b5d1d9617095cd2531c16539817f8703d Mon Sep 17 00:00:00 2001
From: Norman Heckscher
Date: Thu, 5 Jan 2017 07:23:05 +1000
Subject: [PATCH 08/11] clean code

Signed-off-by: Norman Heckscher
---
 train.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/train.py b/train.py
index 94cb8560..0f8eab8c 100644
--- a/train.py
+++ b/train.py
@@ -116,7 +116,6 @@ def train(args):
                                                              model.train_op, model.inc_batch_pointer_op], feed)
                 train_writer.add_summary(summary, e * data_loader.num_batches + b)
                 speed = time.time() - start
-                speed = end - start
                 if (e * data_loader.num_batches + b) % args.batch_size == 0:
                     print("{}/{} (epoch {}), lr = {:.6f}, train_loss = {:.3f}, time/batch = {:.3f}" \
                         .format(e * data_loader.num_batches + b,

From 997a833dbd4a8dc3dd2226e96de1b5fc8cab34f5 Mon Sep 17 00:00:00 2001
From: Norman Heckscher
Date: Thu, 5 Jan 2017 08:46:12 +1000
Subject: [PATCH 09/11] Tensorboard loggings

Signed-off-by: Norman Heckscher
---
 model.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/model.py b/model.py
index a1c348a3..1beaea09 100644
--- a/model.py
+++ b/model.py
@@ -36,9 +36,23 @@ def __init__(self, args, training=True):
         self.batch_time = tf.Variable(0.0, name="batch_time", trainable=False)
         tf.summary.scalar("time_batch", self.batch_time)

+        def variable_summaries(var):
+            """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
+            with tf.name_scope('summaries'):
+                mean = tf.reduce_mean(var)
+                tf.summary.scalar('mean', mean)
+                #with tf.name_scope('stddev'):
+                #   stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
+                #tf.summary.scalar('stddev', stddev)
+                tf.summary.scalar('max', tf.reduce_max(var))
+                tf.summary.scalar('min', tf.reduce_min(var))
+                #tf.summary.histogram('histogram', var)
+
         with tf.variable_scope('rnnlm'):
             softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
+            variable_summaries(softmax_w)
             softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
+            variable_summaries(softmax_b)
             with tf.device("/cpu:0"):
                 embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
                 inputs = tf.nn.embedding_lookup(embedding, self.input_data)
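
Note: a compressed view of what the variable_summaries helper from the patch above attaches -- per-tensor mean/max/min scalars under a 'summaries' name scope, which tf.summary.merge_all() later folds into the single summary fetched each step. The variable shape below is an assumption; the stddev and histogram lines stay commented out in the patch itself.

    # Illustrative sketch only -- toy variable, shape assumed.
    import tensorflow as tf

    def variable_summaries(var):
        """Attach mean/max/min scalar summaries to a Tensor for TensorBoard."""
        with tf.name_scope('summaries'):
            tf.summary.scalar('mean', tf.reduce_mean(var))
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))

    softmax_w = tf.get_variable("softmax_w", [128, 65])   # rnn_size x vocab_size, assumed
    variable_summaries(softmax_w)
    merged = tf.summary.merge_all()   # one fetch per step feeds the FileWriter
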
From 94eb1a3fb5aa82e36f4a8aef7e870510dd8c3449 Mon Sep 17 00:00:00 2001
From: Norman Heckscher
Date: Thu, 5 Jan 2017 08:48:21 +1000
Subject: [PATCH 10/11] clean from merge

Signed-off-by: Norman Heckscher
---
 train.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/train.py b/train.py
index 0f8eab8c..083817d7 100644
--- a/train.py
+++ b/train.py
@@ -106,7 +106,6 @@ def train(args):
             if args.init_from is not None:
                 data_loader.pointer = model.batch_pointer.eval()
                 args.init_from = None
-            speed = 0
             for b in range(data_loader.pointer, data_loader.num_batches):
                 start = time.time()
                 x, y = data_loader.next_batch()

From f103555dc8776ad973936fc4d8d7e6e3dfb41b7c Mon Sep 17 00:00:00 2001
From: Norman Heckscher
Date: Sat, 7 Jan 2017 09:02:11 +1000
Subject: [PATCH 11/11] Allow % of GPU memory to be allocated

Signed-off-by: Norman Heckscher
---
 train.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/train.py b/train.py
index 79ba44c7..e71111cc 100644
--- a/train.py
+++ b/train.py
@@ -36,6 +36,8 @@ def main():
                         help='learning rate')
     parser.add_argument('--decay_rate', type=float, default=0.97,
                         help='decay rate for rmsprop')
+    parser.add_argument('--gpu_mem', type=float, default=0.666,
+                        help='% of gpu memory to be allocated to this process. Default is 66.6%')
     parser.add_argument('--init_from', type=str, default=None,
                         help="""continue training from saved model at this path. Path must contain files saved by previous training process:
                             'config.pkl' : configuration;
@@ -83,8 +85,9 @@ def train(args):

     merged = tf.summary.merge_all()
     train_writer = tf.summary.FileWriter('logs')
+    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_mem)

-    with tf.Session() as sess:
+    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
         train_writer.add_graph(sess.graph)
         tf.global_variables_initializer().run()
         saver = tf.train.Saver(tf.global_variables())
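
Note: to close the series, a standalone sketch of the GPU memory cap the final patch adds. tf.GPUOptions(per_process_gpu_memory_fraction=...) limits how much of each visible GPU the process may claim, instead of TensorFlow's default of grabbing nearly all of it; the fraction below mirrors the patch's --gpu_mem default. As an aside, argparse treats a literal '%' in help strings as a format character, so the new help text may need '%%' for --help to render cleanly.

    # Illustrative sketch only -- fraction mirrors the patch's --gpu_mem default.
    import tensorflow as tf

    gpu_mem = 0.666   # fraction of GPU memory this process is allowed to allocate
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_mem)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        # every allocation in this session is capped at roughly 66.6% of each visible GPU
        sess.run(tf.global_variables_initializer())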