Train job keeps 3 checkpoints at a time

abisee · web-flow · commit f15e22b4c8d5 · 2017-08-16T14:46:23.000-07:00
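Keeping the three most recent checkpoints (instead of only the latest) may be useful for recovering from NaN problems: if the newest checkpoint was saved after the loss had already become NaN, training can be restarted from an earlier one.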
diff --git a/run_summarization.py b/run_summarization.py
@@ -161,7 +161,7 @@ def setup_training(model, batcher):
     convert_to_coverage_model()
   if FLAGS.restore_best_model:
     restore_best_model()
-  saver = tf.train.Saver(max_to_keep=1) # only keep 1 checkpoint at a time
+  saver = tf.train.Saver(max_to_keep=3) # keep 3 checkpoints at a time
 
   sv = tf.train.Supervisor(logdir=train_dir,
                      is_chief=True,