diff --git a/CMakeLists.txt b/CMakeLists.txt
index 748d88a351f..1403ae001ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,8 +6,16 @@ include(GNUInstallDirs)
 include(Utils)
 include(third_party/get_third_party)
 
+# TODO: switch to FindPython3 once the minimum required CMake version is >= 3.12.
+find_package(PythonInterp 3)  # request 3.x so python3 is preferred over a Python 2 `python`
+if(NOT PYTHON_EXECUTABLE OR PYTHON_VERSION_MAJOR LESS 3)
+    message(WARNING "Needs python3 to auto-generate most CMake files, but not found. "
+                    "Will try `python3` directly...")
+    set(PYTHON_EXECUTABLE "python3")  # last resort: let execute_process() resolve it via PATH
+endif()
+
 message(STATUS "Running gen_cmake_skeleton.py")
-execute_process(COMMAND python
+execute_process(COMMAND ${PYTHON_EXECUTABLE}
     "${CMAKE_CURRENT_SOURCE_DIR}/cmake/gen_cmake_skeleton.py"
     "${CMAKE_CURRENT_SOURCE_DIR}/src"
     "--quiet"
@@ -28,11 +36,21 @@ if(BUILD_SHARED_LIBS)
     endif()
 endif()
 
-set(MATHLIB "OpenBLAS" CACHE STRING "OpenBLAS|MKL|Accelerate")
+# Default to the built-in BLAS (Accelerate) on macOS and to OpenBLAS everywhere else.
+set(kaldi_default_mathlib "OpenBLAS")
+if(APPLE)
+    set(kaldi_default_mathlib "Accelerate")
+endif()
+set(MATHLIB "${kaldi_default_mathlib}" CACHE STRING "OpenBLAS|MKL|Accelerate")
 option(KALDI_BUILD_EXE "If disabled, will make add_kaldi_executable a no-op" ON)
 option(KALDI_BUILD_TEST "If disabled, will make add_kaldi_test_executable a no-op" ON)
 option(KALDI_USE_PATCH_NUMBER "Use MAJOR.MINOR.PATCH format, otherwise MAJOR.MINOR" OFF)
 
+if (KALDI_BUILD_TEST)
+    include(CTest)     # CTest module (defines the BUILD_TESTING option)
+    enable_testing()   # called explicitly so tests are enabled whenever KALDI_BUILD_TEST is ON
+endif()
+
 link_libraries(${CMAKE_DL_LIBS})
 
 find_package(Threads)
@@ -53,6 +71,19 @@ elseif(MATHLIB STREQUAL "MKL")
     include_directories($ENV{MKLROOT}/include) # TODO: maybe not use env, idk, find_package doesnt handle includes...
     link_libraries(${BLAS_LIBRARIES} ${LAPACK_LIBRARIES})
 elseif(MATHLIB STREQUAL "Accelerate")
+    execute_process(COMMAND sw_vers -productVersion
+        OUTPUT_VARIABLE MACOS_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)  # drop the trailing newline
+    if(MACOS_VERSION VERSION_LESS "10.12" AND MACOS_VERSION VERSION_GREATER_EQUAL "10.11")  # 10.11.x
+        message(WARNING
+            "**BAD WARNING**: You are using OS X El Capitan.  Some versions of this OS"
+            " have a bug in the BLAS implementation that affects Kaldi."
+            " After compiling, cd to matrix/ and type 'make test'.  The"
+            " test will fail if the problem exists in your version."
+            " Eventually this issue will be fixed by system updates from"
+            " Apple.  Unexplained crashes with reports of NaNs will"
+            " be caused by this bug, but some recipes will (sometimes) work."
+        )
+    endif()
     set(BLA_VENDOR "Apple")
     find_package(BLAS REQUIRED)
     find_package(LAPACK REQUIRED)
@@ -160,6 +191,11 @@ add_subdirectory(src/kws)
 
 add_subdirectory(src/itf)
 
+if(TENSORFLOW_DIR)  # optional: the TensorFlow RNNLM dirs are added only when TENSORFLOW_DIR is set
+    add_subdirectory(src/tfrnnlm)
+    add_subdirectory(src/tfrnnlmbin)
+endif()
+
 # add all cuda libraries
 if(CUDA_FOUND)
     add_subdirectory(src/cudafeat)
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index 88dbefdacc9..c7f45827a99 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -36,6 +36,10 @@ macro(add_kaldi_test_executable)
         cmake_parse_arguments(kaldi_test_exe "" "NAME" "SOURCES;DEPENDS" ${ARGN})
         add_executable(${kaldi_test_exe_NAME} ${kaldi_test_exe_SOURCES})
         target_link_libraries(${kaldi_test_exe_NAME} PRIVATE ${kaldi_test_exe_DEPENDS})
+        add_test(
+            NAME ${kaldi_test_exe_NAME}
+            COMMAND ${kaldi_test_exe_NAME}
+            WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
         # list(APPEND KALDI_TEST_EXECUTABLES ${kaldi_test_exe_NAME})
         install(TARGETS ${kaldi_test_exe_NAME} RUNTIME DESTINATION testbin)
 
diff --git a/cmake/gen_cmake_skeleton.py b/cmake/gen_cmake_skeleton.py
index fa506943662..8c393630cfb 100644
--- a/cmake/gen_cmake_skeleton.py
+++ b/cmake/gen_cmake_skeleton.py
@@ -180,11 +180,13 @@ def gen_code(self):
 
         if len(self.cuda_source_list) > 0:
             self.source_list.append("${CUDA_OBJS}")
-            ret.append("cuda_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..)")
-            ret.append("cuda_compile(CUDA_OBJS")
+            ret.append("if(CUDA_FOUND)")
+            ret.append("    cuda_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..)")
+            ret.append("    cuda_compile(CUDA_OBJS")
             for f in self.cuda_source_list:
-                ret.append("    " + f)
-            ret.append(")\n")
+                ret.append("        " + f)
+            ret.append("    )")
+            ret.append("endif()\n")
 
         ret.append("add_library(" + self.target_name)
         for f in self.source_list:
@@ -278,6 +280,8 @@ def write_file(self):
 
     subdirs = get_subdirectories(".")
     for d in subdirs:
+        if d.startswith('tfrnnlm'):
+            continue
         cmakelists = CMakeListsFile(d)
         if is_bin_dir(d):
             for f in get_files(d):
diff --git a/egs/wsj/s5/steps/tfrnnlm/lstm.py b/egs/wsj/s5/steps/tfrnnlm/lstm.py
index 433dc87b4c6..a66e7d69a35 100644
--- a/egs/wsj/s5/steps/tfrnnlm/lstm.py
+++ b/egs/wsj/s5/steps/tfrnnlm/lstm.py
@@ -25,32 +25,26 @@
 from __future__ import division
 from __future__ import print_function
 
-import sys
-
-import inspect
-import time
-
-import numpy as np
+import absl.app  # import the submodule explicitly: `absl.app.run(main)` is called at the bottom
+import absl.flags as flags
 import tensorflow as tf
 
 import reader
 
-flags = tf.flags
-logging = tf.logging
-
 flags.DEFINE_integer("hidden_size", 200, "hidden dim of RNN")
 
 flags.DEFINE_string("data_path", None,
                     "Where the training/test data is stored.")
 flags.DEFINE_string("vocab_path", None,
                     "Where the wordlist file is stored.")
-flags.DEFINE_string("save_path", None,
+flags.DEFINE_string("save_path", "export",
                     "Model output directory.")
 flags.DEFINE_bool("use_fp16", False,
                   "Train using 16-bit floats instead of 32bit floats")
 
 FLAGS = flags.FLAGS
 
+
 class Config(object):
   init_scale = 0.1
   learning_rate = 1.0
@@ -64,265 +58,183 @@ class Config(object):
   lr_decay = 0.5
   batch_size = 64
 
+
 def data_type():
   return tf.float16 if FLAGS.use_fp16 else tf.float32
 
 
-class RnnlmInput(object):
-  """The input data."""
+class RNNLMModel(tf.Module):
+  """The RNN model itself."""
 
-  def __init__(self, config, data, name=None):
-    self.batch_size = batch_size = config.batch_size
-    self.num_steps = num_steps = config.num_steps
-    self.epoch_size = ((len(data) // batch_size) - 1) // num_steps
-    self.input_data, self.targets = reader.rnnlm_producer(
-        data, batch_size, num_steps, name=name)
+  def __init__(self, config, logits_bias_initializer=None):
+    super().__init__()
+    self._config = config
 
-
-class RnnlmModel(object):
-  """The RNNLM model."""
-
-  def __init__(self, is_training, config, input_):
-    self._input = input_
-
-    batch_size = input_.batch_size
-    num_steps = input_.num_steps
     size = config.hidden_size
     vocab_size = config.vocab_size
+    dt = data_type()
 
     def lstm_cell():
-      # With the latest TensorFlow source code (as of Mar 27, 2017),
-      # the BasicLSTMCell will need a reuse parameter which is unfortunately not
-      # defined in TensorFlow 1.0. To maintain backwards compatibility, we add
-      # an argument check here:
-      if 'reuse' in inspect.getargspec(
-          tf.contrib.rnn.BasicLSTMCell.__init__).args:
-        return tf.contrib.rnn.BasicLSTMCell(
-            size, forget_bias=0.0, state_is_tuple=True,
-            reuse=tf.get_variable_scope().reuse)
-      else:
-        return tf.contrib.rnn.BasicLSTMCell(
-            size, forget_bias=0.0, state_is_tuple=True)
-    attn_cell = lstm_cell
-    if is_training and config.keep_prob < 1:
-      def attn_cell():
-        return tf.contrib.rnn.DropoutWrapper(
-            lstm_cell(), output_keep_prob=config.keep_prob)
-    self.cell = tf.contrib.rnn.MultiRNNCell(
-        [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True)
-
-    self._initial_state = self.cell.zero_state(batch_size, data_type())
-    self._initial_state_single = self.cell.zero_state(1, data_type())
-
-    self.initial = tf.reshape(tf.stack(axis=0, values=self._initial_state_single), [config.num_layers, 2, 1, size], name="test_initial_state")
-
-
-    # first implement the less efficient version
-    test_word_in = tf.placeholder(tf.int32, [1, 1], name="test_word_in")
-
-    state_placeholder = tf.placeholder(tf.float32, [config.num_layers, 2, 1, size], name="test_state_in")
-    # unpacking the input state context 
-    l = tf.unstack(state_placeholder, axis=0)
-    test_input_state = tuple(
-               [tf.contrib.rnn.LSTMStateTuple(l[idx][0],l[idx][1])
-                 for idx in range(config.num_layers)]
-    )
-
-    with tf.device("/cpu:0"):
-      self.embedding = tf.get_variable(
-          "embedding", [vocab_size, size], dtype=data_type())
-
-      inputs = tf.nn.embedding_lookup(self.embedding, input_.input_data)
-      test_inputs = tf.nn.embedding_lookup(self.embedding, test_word_in)
-
-    # test time
-    with tf.variable_scope("RNN"):
-      (test_cell_output, test_output_state) = self.cell(test_inputs[:, 0, :], test_input_state)
-
-    test_state_out = tf.reshape(tf.stack(axis=0, values=test_output_state), [config.num_layers, 2, 1, size], name="test_state_out")
-    test_cell_out = tf.reshape(test_cell_output, [1, size], name="test_cell_out")
-    # above is the first part of the graph for test
-    # test-word-in
-    #               > ---- > test-state-out
-    # test-state-in        > test-cell-out
-
-
-    # below is the 2nd part of the graph for test
-    # test-word-out
-    #               > prob(word | test-word-out)
-    # test-cell-in
-
-    test_word_out = tf.placeholder(tf.int32, [1, 1], name="test_word_out")
-    cellout_placeholder = tf.placeholder(tf.float32, [1, size], name="test_cell_in")
-
-    softmax_w = tf.get_variable(
-        "softmax_w", [size, vocab_size], dtype=data_type())
-    softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())
-
-    test_logits = tf.matmul(cellout_placeholder, softmax_w) + softmax_b
-    test_softmaxed = tf.nn.log_softmax(test_logits)
-
-    p_word = test_softmaxed[0, test_word_out[0,0]]
-    test_out = tf.identity(p_word, name="test_out")
-
-    if is_training and config.keep_prob < 1:
-      inputs = tf.nn.dropout(inputs, config.keep_prob)
-
-    # Simplified version of models/tutorials/rnn/rnn.py's rnn().
-    # This builds an unrolled LSTM for tutorial purposes only.
-    # In general, use the rnn() or state_saving_rnn() from rnn.py.
-    #
-    # The alternative version of the code below is:
-    #
-    # inputs = tf.unstack(inputs, num=num_steps, axis=1)
-    # outputs, state = tf.contrib.rnn.static_rnn(
-    #     cell, inputs, initial_state=self._initial_state)
-    outputs = []
-    state = self._initial_state
-    with tf.variable_scope("RNN"):
-      for time_step in range(num_steps):
-        if time_step > -1: tf.get_variable_scope().reuse_variables()
-        (cell_output, state) = self.cell(inputs[:, time_step, :], state)
-        outputs.append(cell_output)
-
-    output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
-    logits = tf.matmul(output, softmax_w) + softmax_b
-    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
-        [logits],
-        [tf.reshape(input_.targets, [-1])],
-        [tf.ones([batch_size * num_steps], dtype=data_type())])
-    self._cost = cost = tf.reduce_sum(loss) / batch_size
-    self._final_state = state
-
-    if not is_training:
-      return
-
-    self._lr = tf.Variable(0.0, trainable=False)
-    tvars = tf.trainable_variables()
-    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
-                                      config.max_grad_norm)
-    optimizer = tf.train.GradientDescentOptimizer(self._lr)
-    self._train_op = optimizer.apply_gradients(
-        list(zip(grads, tvars)),
-        global_step=tf.contrib.framework.get_or_create_global_step())
-
-    self._new_lr = tf.placeholder(
-        tf.float32, shape=[], name="new_learning_rate")
-    self._lr_update = tf.assign(self._lr, self._new_lr)
-
-  def assign_lr(self, session, lr_value):
-    session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
-
-  @property
-  def input(self):
-    return self._input
-
-  @property
-  def initial_state(self):
-    return self._initial_state
-
-  @property
-  def cost(self):
-    return self._cost
-
-  @property
-  def final_state(self):
-    return self._final_state
-
-  @property
-  def lr(self):
-    return self._lr
-
-  @property
-  def train_op(self):
-    return self._train_op
-
-def run_epoch(session, model, eval_op=None, verbose=False):
-  """Runs the model on the given data."""
-  start_time = time.time()
-  costs = 0.0
-  iters = 0
-  state = session.run(model.initial_state)
-
-  fetches = {
-      "cost": model.cost,
-      "final_state": model.final_state,
-  }
-  if eval_op is not None:
-    fetches["eval_op"] = eval_op
-
-  for step in range(model.input.epoch_size):
-    feed_dict = {}
-    for i, (c, h) in enumerate(model.initial_state):
-      feed_dict[c] = state[i].c
-      feed_dict[h] = state[i].h
-
-    vals = session.run(fetches, feed_dict)
-    cost = vals["cost"]
-    state = vals["final_state"]
-
-    costs += cost
-    iters += model.input.num_steps
-
-    if verbose and step % (model.input.epoch_size // 10) == 10:
-      print("%.3f perplexity: %.3f speed: %.0f wps" %
-            (step * 1.0 / model.input.epoch_size, np.exp(costs / iters),
-             iters * model.input.batch_size / (time.time() - start_time)))
-
-  return np.exp(costs / iters)
+      return tf.keras.layers.LSTMCell(size, dtype=dt, unit_forget_bias=False)
+
+    def add_dropout(cell):
+      if config.keep_prob < 1:
+        cell = tf.nn.RNNCellDropoutWrapper(cell=cell, output_keep_prob=config.keep_prob)
+      return cell
+
+    self.embedding = tf.keras.layers.Embedding(vocab_size, size, dtype=dt)
+    self.cells = [lstm_cell() for _ in range(config.num_layers)]
+    self.rnn = tf.keras.layers.RNN(self.cells, return_sequences=True)
+
+    if logits_bias_initializer is None:
+      logits_bias_initializer = 'zeros'
+    self.fc = tf.keras.layers.Dense(vocab_size, bias_initializer=logits_bias_initializer)
+
+    # only used in training
+    self.training_cells = [add_dropout(cell) for cell in self.cells]
+    self.training_rnn = tf.keras.layers.RNN(self.training_cells, return_sequences=True)
+
+  def get_logits(self, word_ids, is_training=False):
+    """Embed `word_ids`, run the training or inference RNN, and project with `self.fc`."""
+    embedded = self.embedding(word_ids)
+    if is_training and self._config.keep_prob < 1:
+      # Second argument is the drop rate, hence 1 - keep_prob.
+      embedded = tf.nn.dropout(embedded, 1 - self._config.keep_prob)
+    recurrent = self.training_rnn if is_training else self.rnn
+    return self.fc(recurrent(embedded))
+
+  def get_loss(self, word_ids, labels, is_training=False):
+    """Sparse categorical cross-entropy (from logits) of `labels` given `word_ids`."""
+    criterion = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
+    return criterion(labels, self.get_logits(word_ids, is_training))
+
+  def get_score(self, logits):
+    """Map logits to the model's output score: here, the log-softmax of the logits."""
+    return tf.nn.log_softmax(logits)
+
+  @tf.function
+  def get_initial_state(self):
+    """Exported function: returns {"initial_state": the RNN's zeroed initial state, stacked}."""
+    # The state inherits the dummy input's dtype (tf.int32 would give an int32 state), so use
+    # data_type() to stay consistent with the `context` spec that single_step is exported with.
+    fake_input = tf.constant(0, dtype=data_type(), shape=[1, 1])
+    return {"initial_state": tf.stack(self.rnn.get_initial_state(fake_input))}
+
+  @tf.function
+  def single_step(self, context, word_id):
+    """Exported function which performs one step of the RNN model from a given state."""
+    rnn = tf.keras.layers.RNN(self.cells, return_state=True)
+    context = tf.unstack(context)
+    context = [tf.unstack(c) for c in context]
+    # `context` is now a nested list (unstacked twice) and is passed as the RNN's initial state.
+    inputs = self.embedding(word_id)
+    rnn_out_and_states = rnn(inputs, initial_state=context)
+
+    rnn_out = rnn_out_and_states[0]
+    rnn_states = tf.stack(rnn_out_and_states[1:])
+    # NOTE(review): log_prob below is indexed by the *input* word_id itself - confirm intended.
+    logits = self.fc(rnn_out)
+    output = self.get_score(logits)
+    log_prob = output[0, word_id[0, 0]]
+    return {"log_prob": log_prob, "rnn_states": rnn_states, "rnn_out": rnn_out}
+
+
+class RNNLMModelTrainer(tf.Module):
+  """Trains and evaluates an RNNLMModel using SGD with global-norm gradient clipping."""
+
+  def __init__(self, model: RNNLMModel, config):
+    super().__init__()
+    self.model = model
+    self.learning_rate = tf.Variable(1e-3, dtype=tf.float32, trainable=False)
+    self.optimizer = tf.optimizers.SGD(learning_rate=self.learning_rate)
+    self.max_grad_norm = config.max_grad_norm
+    self.eval_mean_loss = tf.metrics.Mean()
+
+  def train_one_epoch(self, data_producer, learning_rate, verbose=True):
+    """Train on every batch from `data_producer.iterate()` at the given learning rate."""
+    print("start epoch with learning rate {}".format(learning_rate))
+    self.learning_rate.assign(learning_rate)
+    log_every = max(data_producer.epoch_size // 10, 1)  # >= 1: no modulo-by-zero on short epochs
+    for i, (inputs, labels) in enumerate(data_producer.iterate()):
+      loss = self._train_step(inputs, labels)
+      if verbose and i % log_every == 1:
+        print("{}/{}: loss={}".format(i, data_producer.epoch_size, loss))
+
+  @tf.function
+  def evaluate(self, data_producer):
+    """Return the mean of the model loss over the batches from `data_producer.iterate()`."""
+    self.eval_mean_loss.reset_states()
+    for i, (inputs, labels) in enumerate(data_producer.iterate()):
+      loss = self.model.get_loss(inputs, labels)
+      self.eval_mean_loss.update_state(loss)
+    return self.eval_mean_loss.result()
+
+  @tf.function
+  def _train_step(self, inputs, labels):
+    """Run one clipped-gradient SGD update on a batch and return its training loss."""
+    with tf.GradientTape() as tape:
+      loss = self.model.get_loss(inputs, labels, is_training=True)
+    tvars = self.model.trainable_variables
+    grads = tape.gradient(loss, tvars)
+    clipped_grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
+    self.optimizer.apply_gradients(zip(clipped_grads, tvars))
+    return loss
 
 
 def get_config():
   return Config()
 
+
 def main(_):
-  if not FLAGS.data_path:
-    raise ValueError("Must set --data_path to RNNLM data directory")
+  # Turn this on to try the model code with this source file itself!
+  __TESTING = False
 
-  raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
-  train_data, valid_data, _, word_map = raw_data
+  if __TESTING:
+    (train_data, valid_data), word_map = reader.rnnlm_gen_data(__file__, reader.__file__)
+  else:
+    if not FLAGS.data_path:
+      raise ValueError("Must set --data_path to RNNLM data directory")
+
+    raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
+    train_data, valid_data, _, word_map = raw_data
 
   config = get_config()
   config.hidden_size = FLAGS.hidden_size
   config.vocab_size = len(word_map)
-  eval_config = get_config()
-  eval_config.batch_size = 1
-  eval_config.num_steps = 1
-
-  with tf.Graph().as_default():
-    initializer = tf.random_uniform_initializer(-config.init_scale,
-                                                config.init_scale)
-
-    with tf.name_scope("Train"):
-      train_input = RnnlmInput(config=config, data=train_data, name="TrainInput")
-      with tf.variable_scope("Model", reuse=None, initializer=initializer):
-        m = RnnlmModel(is_training=True, config=config, input_=train_input)
-      tf.summary.scalar("Training Loss", m.cost)
-      tf.summary.scalar("Learning Rate", m.lr)
-
-    with tf.name_scope("Valid"):
-      valid_input = RnnlmInput(config=config, data=valid_data, name="ValidInput")
-      with tf.variable_scope("Model", reuse=True, initializer=initializer):
-        mvalid = RnnlmModel(is_training=False, config=config, input_=valid_input)
-      tf.summary.scalar("Validation Loss", mvalid.cost)
-
-    sv = tf.train.Supervisor(logdir=FLAGS.save_path)
-    with sv.managed_session() as session:
-      for i in range(config.max_max_epoch):
-        lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
-        m.assign_lr(session, config.learning_rate * lr_decay)
-
-        print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
-        train_perplexity = run_epoch(session, m, eval_op=m.train_op,
-                                     verbose=True)
-
-        print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
-        valid_perplexity = run_epoch(session, mvalid)
-        print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))
-
-      if FLAGS.save_path:
-        print("Saving model to %s." % FLAGS.save_path)
-        sv.saver.save(session, FLAGS.save_path)
+
+  if __TESTING:
+    # use a much smaller scale on our tiny test data
+    config.num_steps = 8
+    config.batch_size = 4
+
+  model = RNNLMModel(config)
+  train_producer = reader.RNNLMProducer(train_data, config.batch_size, config.num_steps)
+  trainer = RNNLMModelTrainer(model, config)
+
+  valid_producer = reader.RNNLMProducer(valid_data, config.batch_size, config.num_steps)
+
+  # Save variables to disk after every epoch so that training state survives a crash.
+  # The data producer is checkpointed too, to preserve the feeding progress.
+  checkpoint = tf.train.Checkpoint(trainer=trainer, data_feeder=train_producer)
+  manager = tf.train.CheckpointManager(checkpoint, "checkpoints/", 5)
+
+  for i in range(config.max_max_epoch):
+    lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
+    lr = config.learning_rate * lr_decay
+    trainer.train_one_epoch(train_producer, lr)
+    manager.save()
+
+    eval_loss = trainer.evaluate(valid_producer)
+    print("validating: loss={}".format(eval_loss))
+
+  # Export
+  print("Saving model to %s." % FLAGS.save_path)
+  spec = [tf.TensorSpec(shape=[config.num_layers, 2, 1, config.hidden_size], dtype=data_type(), name="context"),
+          tf.TensorSpec(shape=[1, 1], dtype=tf.int32, name="word_id")]
+  cfunc = model.single_step.get_concrete_function(*spec)
+  cfunc2 = model.get_initial_state.get_concrete_function()
+  tf.saved_model.save(model, FLAGS.save_path, signatures={"single_step": cfunc, "get_initial_state": cfunc2})
+
 
 if __name__ == "__main__":
-  tf.app.run()
+  absl.app.run(main)
diff --git a/egs/wsj/s5/steps/tfrnnlm/lstm_fast.py b/egs/wsj/s5/steps/tfrnnlm/lstm_fast.py
index ff6c7263804..e299f449636 100644
--- a/egs/wsj/s5/steps/tfrnnlm/lstm_fast.py
+++ b/egs/wsj/s5/steps/tfrnnlm/lstm_fast.py
@@ -25,32 +25,28 @@
 from __future__ import division
 from __future__ import print_function
 
-import sys
-
-import inspect
-import time
-
-import numpy as np
+import absl
+import absl.flags as flags
 import tensorflow as tf
+from tensorflow.python.keras.losses import LossFunctionWrapper
 
 import reader
+from lstm import RNNLMModel, RNNLMModelTrainer
 
-flags = tf.flags
-logging = tf.logging
-
-flags.DEFINE_integer("hidden_size", 200, "hidden dim of RNN")
-
-flags.DEFINE_string("data_path", None,
-                    "Where the training/test data is stored.")
-flags.DEFINE_string("vocab_path", None,
-                    "Where the wordlist file is stored.")
-flags.DEFINE_string("save_path", None,
-                    "Model output directory.")
-flags.DEFINE_bool("use_fp16", False,
-                  "Train using 16-bit floats instead of 32bit floats")
+# NOTE: the command-line flags are intentionally NOT redefined here: they are
+# already registered by `lstm` (imported above), and absl raises
+# DuplicateFlagError when a flag is defined twice.  Flags available:
+#   --hidden_size (int, default 200): hidden dim of RNN
+#   --data_path   (str, default None): where the training/test data is stored
+#   --vocab_path  (str, default None): where the wordlist file is stored
+#   --save_path   (str, default "export"): model output directory
+#   --use_fp16    (bool, default False): train using 16-bit floats instead of
+#                 32-bit floats
+#
 
 FLAGS = flags.FLAGS
 
+
 class Config(object):
   """Small config."""
   init_scale = 0.1
@@ -65,280 +61,102 @@ class Config(object):
   lr_decay = 0.8
   batch_size = 64
 
+
 def data_type():
   return tf.float16 if FLAGS.use_fp16 else tf.float32
 
+
 # this new "softmax" function we show can train a "self-normalized" RNNLM where
 # the sum of the output is automatically (close to) 1.0
 # which saves a lot of computation for lattice-rescoring
 def new_softmax(labels, logits):
-  target = tf.reshape(labels, [-1])
-  f_logits = tf.exp(logits)
-  row_sums = tf.reduce_sum(f_logits, 1) # this is the negative part of the objf
-
-  t2 = tf.expand_dims(target, 1)
-  range = tf.expand_dims(tf.range(tf.shape(target)[0]), 1)
+  flatten_labels = tf.reshape(labels, [-1])
+  n_samples = tf.shape(flatten_labels)[0]
+  flatten_logits = tf.reshape(logits, shape=[n_samples, -1])
+  f_logits = tf.exp(flatten_logits)
+  row_sums = tf.reduce_sum(f_logits, -1) # this is the negative part of the objf
+
+  t2 = tf.expand_dims(flatten_labels, 1)
+  range = tf.expand_dims(tf.range(n_samples), 1)
   ind = tf.concat([range, t2], 1)
-  res = tf.gather_nd(logits, ind)
+  res = tf.gather_nd(flatten_logits, ind)
 
   return -res + row_sums - 1
 
-class RnnlmInput(object):
-  """The input data."""
-
-  def __init__(self, config, data, name=None):
-    self.batch_size = batch_size = config.batch_size
-    self.num_steps = num_steps = config.num_steps
-    self.epoch_size = ((len(data) // batch_size) - 1) // num_steps
-    self.input_data, self.targets = reader.rnnlm_producer(
-        data, batch_size, num_steps, name=name)
-
-
-class RnnlmModel(object):
-  """The RNNLM model."""
-
-  def __init__(self, is_training, config, input_):
-    self._input = input_
-
-    batch_size = input_.batch_size
-    num_steps = input_.num_steps
-    size = config.hidden_size
-    vocab_size = config.vocab_size
-
-    def lstm_cell():
-      # With the latest TensorFlow source code (as of Mar 27, 2017),
-      # the BasicLSTMCell will need a reuse parameter which is unfortunately not
-      # defined in TensorFlow 1.0. To maintain backwards compatibility, we add
-      # an argument check here:
-      if 'reuse' in inspect.getargspec(
-          tf.contrib.rnn.BasicLSTMCell.__init__).args:
-        return tf.contrib.rnn.BasicLSTMCell(
-            size, forget_bias=0.0, state_is_tuple=True,
-            reuse=tf.get_variable_scope().reuse)
-      else:
-        return tf.contrib.rnn.BasicLSTMCell(
-            size, forget_bias=0.0, state_is_tuple=True)
-    attn_cell = lstm_cell
-    if is_training and config.keep_prob < 1:
-      def attn_cell():
-        return tf.contrib.rnn.DropoutWrapper(
-            lstm_cell(), output_keep_prob=config.keep_prob)
-    self.cell = tf.contrib.rnn.MultiRNNCell(
-        [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True)
-
-    self._initial_state = self.cell.zero_state(batch_size, data_type())
-    self._initial_state_single = self.cell.zero_state(1, data_type())
-
-    self.initial = tf.reshape(tf.stack(axis=0, values=self._initial_state_single), [config.num_layers, 2, 1, size], name="test_initial_state")
-
-    # first implement the less efficient version
-    test_word_in = tf.placeholder(tf.int32, [1, 1], name="test_word_in")
-
-    state_placeholder = tf.placeholder(tf.float32, [config.num_layers, 2, 1, size], name="test_state_in")
-    # unpacking the input state context 
-    l = tf.unstack(state_placeholder, axis=0)
-    test_input_state = tuple(
-               [tf.contrib.rnn.LSTMStateTuple(l[idx][0],l[idx][1])
-                 for idx in range(config.num_layers)]
-    )
-
-    with tf.device("/cpu:0"):
-      self.embedding = tf.get_variable(
-          "embedding", [vocab_size, size], dtype=data_type())
-
-      inputs = tf.nn.embedding_lookup(self.embedding, input_.input_data)
-      test_inputs = tf.nn.embedding_lookup(self.embedding, test_word_in)
-
-    # test time
-    with tf.variable_scope("RNN"):
-      (test_cell_output, test_output_state) = self.cell(test_inputs[:, 0, :], test_input_state)
-
-    test_state_out = tf.reshape(tf.stack(axis=0, values=test_output_state), [config.num_layers, 2, 1, size], name="test_state_out")
-    test_cell_out = tf.reshape(test_cell_output, [1, size], name="test_cell_out")
-    # above is the first part of the graph for test
-    # test-word-in
-    #               > ---- > test-state-out
-    # test-state-in        > test-cell-out
-
-
-    # below is the 2nd part of the graph for test
-    # test-word-out
-    #               > prob(word | test-word-out)
-    # test-cell-in
-
-    test_word_out = tf.placeholder(tf.int32, [1, 1], name="test_word_out")
-    cellout_placeholder = tf.placeholder(tf.float32, [1, size], name="test_cell_in")
-
-    softmax_w = tf.get_variable(
-        "softmax_w", [size, vocab_size], dtype=data_type())
-    softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())
-    softmax_b = softmax_b - 9.0
-
-    test_logits = tf.matmul(cellout_placeholder, tf.transpose(tf.nn.embedding_lookup(tf.transpose(softmax_w), test_word_out[0]))) + softmax_b[test_word_out[0,0]]
-
-    p_word = test_logits[0, 0]
-    test_out = tf.identity(p_word, name="test_out")
-
-    if is_training and config.keep_prob < 1:
-      inputs = tf.nn.dropout(inputs, config.keep_prob)
-
-    # Simplified version of models/tutorials/rnn/rnn.py's rnn().
-    # This builds an unrolled LSTM for tutorial purposes only.
-    # In general, use the rnn() or state_saving_rnn() from rnn.py.
-    #
-    # The alternative version of the code below is:
-    #
-    # inputs = tf.unstack(inputs, num=num_steps, axis=1)
-    # outputs, state = tf.contrib.rnn.static_rnn(
-    #     cell, inputs, initial_state=self._initial_state)
-    outputs = []
-    state = self._initial_state
-    with tf.variable_scope("RNN"):
-      for time_step in range(num_steps):
-        if time_step > -1: tf.get_variable_scope().reuse_variables()
-        (cell_output, state) = self.cell(inputs[:, time_step, :], state)
-        outputs.append(cell_output)
-
-    output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
-    logits = tf.matmul(output, softmax_w) + softmax_b
-    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
-        [logits],
-        [tf.reshape(input_.targets, [-1])],
-        [tf.ones([batch_size * num_steps], dtype=data_type())],
-        softmax_loss_function=new_softmax)
-    self._cost = cost = tf.reduce_sum(loss) / batch_size
-    self._final_state = state
-
-    if not is_training:
-      return
-
-    self._lr = tf.Variable(0.0, trainable=False)
-    tvars = tf.trainable_variables()
-    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
-                                      config.max_grad_norm)
-    optimizer = tf.train.GradientDescentOptimizer(self._lr)
-    self._train_op = optimizer.apply_gradients(
-        list(zip(grads, tvars)),
-        global_step=tf.contrib.framework.get_or_create_global_step())
-
-    self._new_lr = tf.placeholder(
-        tf.float32, shape=[], name="new_learning_rate")
-    self._lr_update = tf.assign(self._lr, self._new_lr)
-
-  def assign_lr(self, session, lr_value):
-    session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
-
-  @property
-  def input(self):
-    return self._input
-
-  @property
-  def initial_state(self):
-    return self._initial_state
-
-  @property
-  def cost(self):
-    return self._cost
-
-  @property
-  def final_state(self):
-    return self._final_state
-
-  @property
-  def lr(self):
-    return self._lr
-
-  @property
-  def train_op(self):
-    return self._train_op
-
-def run_epoch(session, model, eval_op=None, verbose=False):
-  """Runs the model on the given data."""
-  start_time = time.time()
-  costs = 0.0
-  iters = 0
-  state = session.run(model.initial_state)
-
-  fetches = {
-      "cost": model.cost,
-      "final_state": model.final_state,
-  }
-  if eval_op is not None:
-    fetches["eval_op"] = eval_op
-
-  for step in range(model.input.epoch_size):
-    feed_dict = {}
-    for i, (c, h) in enumerate(model.initial_state):
-      feed_dict[c] = state[i].c
-      feed_dict[h] = state[i].h
-
-    vals = session.run(fetches, feed_dict)
-    cost = vals["cost"]
-    state = vals["final_state"]
-
-
-    costs += cost
-    iters += model.input.num_steps
-
-    if verbose and step % (model.input.epoch_size // 10) == 10:
-      print("%.3f perplexity: %.3f speed: %.0f wps" %
-            (step * 1.0 / model.input.epoch_size, np.exp(costs / iters),
-             iters * model.input.batch_size / (time.time() - start_time)))
-
-  return np.exp(costs / iters)
+
+class MyFastLossFunction(LossFunctionWrapper):
+  def __init__(self):
+    super().__init__(new_softmax)
+
+
+class FastRNNLMModel(RNNLMModel):
+  def __init__(self, config):
+    super().__init__(config, tf.constant_initializer(-9))
+
+  def get_loss(self, word_ids, labels, is_training=False):
+    logits = self.get_logits(word_ids, is_training)
+    loss_obj = MyFastLossFunction()
+    return loss_obj(labels, logits)
+
+  def get_score(self, logits):
+    # In this implementation, logits can be used as dist output
+    return logits
 
 
 def get_config():
   return Config()
 
+
 def main(_):
-  if not FLAGS.data_path:
-    raise ValueError("Must set --data_path to RNNLM data directory")
+  # Turn this on to try the model code with this source file itself!
+  __TESTING = False
 
-  raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
-  train_data, valid_data, _, word_map = raw_data
+  if __TESTING:
+    (train_data, valid_data), word_map = reader.rnnlm_gen_data(__file__, reader.__file__)
+  else:
+    if not FLAGS.data_path:
+      raise ValueError("Must set --data_path to RNNLM data directory")
+
+    raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
+    train_data, valid_data, _, word_map = raw_data
 
   config = get_config()
   config.hidden_size = FLAGS.hidden_size
   config.vocab_size = len(word_map)
-  eval_config = get_config()
-  eval_config.batch_size = 1
-  eval_config.num_steps = 1
-
-  with tf.Graph().as_default():
-    initializer = tf.random_uniform_initializer(-config.init_scale,
-                                                config.init_scale)
-
-    with tf.name_scope("Train"):
-      train_input = RnnlmInput(config=config, data=train_data, name="TrainInput")
-      with tf.variable_scope("Model", reuse=None, initializer=initializer):
-        m = RnnlmModel(is_training=True, config=config, input_=train_input)
-      tf.summary.scalar("Training Loss", m.cost)
-      tf.summary.scalar("Learning Rate", m.lr)
-
-    with tf.name_scope("Valid"):
-      valid_input = RnnlmInput(config=config, data=valid_data, name="ValidInput")
-      with tf.variable_scope("Model", reuse=True, initializer=initializer):
-        mvalid = RnnlmModel(is_training=False, config=config, input_=valid_input)
-      tf.summary.scalar("Validation Loss", mvalid.cost)
-
-    sv = tf.train.Supervisor(logdir=FLAGS.save_path)
-    with sv.managed_session() as session:
-      for i in range(config.max_max_epoch):
-        lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
-        m.assign_lr(session, config.learning_rate * lr_decay)
-
-        print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
-        train_perplexity = run_epoch(session, m, eval_op=m.train_op,
-                                     verbose=True)
-
-        print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
-        valid_perplexity = run_epoch(session, mvalid)
-        print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))
-
-      if FLAGS.save_path:
-        print("Saving model to %s." % FLAGS.save_path)
-        sv.saver.save(session, FLAGS.save_path)
+
+  if __TESTING:
+    # use a much smaller scale on our tiny test data
+    config.num_steps = 8
+    config.batch_size = 4
+
+  model = FastRNNLMModel(config)
+  train_producer = reader.RNNLMProducer(train_data, config.batch_size, config.num_steps)
+  trainer = RNNLMModelTrainer(model, config)
+
+  valid_producer = reader.RNNLMProducer(valid_data, config.batch_size, config.num_steps)
+
+  # Checkpoint the variables to disk to guard against losing progress in a crash.
+  # The data producer can also be saved to preserve the feeding progress.
+  checkpoint = tf.train.Checkpoint(trainer=trainer, data_feeder=train_producer)
+  manager = tf.train.CheckpointManager(checkpoint, "checkpoints/", 5)
+
+  for i in range(config.max_max_epoch):
+    lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
+    lr = config.learning_rate * lr_decay
+    trainer.train_one_epoch(train_producer, lr)
+    manager.save()
+
+    eval_loss = trainer.evaluate(valid_producer)
+    print("validating: loss={}".format(eval_loss))
+
+  if FLAGS.save_path:  # Export a SavedModel only when --save_path is set.
+    print("Saving model to %s." % FLAGS.save_path)
+    spec = [tf.TensorSpec(shape=[config.num_layers, 2, 1, config.hidden_size], dtype=data_type(), name="context"),
+            tf.TensorSpec(shape=[1, 1], dtype=tf.int32, name="word_id")]
+    cfunc = model.single_step.get_concrete_function(*spec)
+    cfunc2 = model.get_initial_state.get_concrete_function()
+    tf.saved_model.save(model, FLAGS.save_path, signatures={"single_step": cfunc, "get_initial_state": cfunc2})
+
 
 if __name__ == "__main__":
-  tf.app.run()
+  absl.app.run(main)
diff --git a/egs/wsj/s5/steps/tfrnnlm/reader.py b/egs/wsj/s5/steps/tfrnnlm/reader.py
index 80cdeccbb26..b0d0a7f563d 100644
--- a/egs/wsj/s5/steps/tfrnnlm/reader.py
+++ b/egs/wsj/s5/steps/tfrnnlm/reader.py
@@ -61,45 +61,61 @@ def rnnlm_raw_data(data_path, vocab_path):
   return train_data, valid_data, vocabulary, word_to_id
 
 
-def rnnlm_producer(raw_data, batch_size, num_steps, name=None):
-  """Iterate on the raw RNNLM data.
+def rnnlm_gen_data(*files):
+  """Generates data and vocab from files.
 
-  This chunks up raw_data into batches of examples and returns Tensors that
-  are drawn from these batches.
+  This function is used solely for testing.
+  """
+  import collections
+  import re
 
-  Args:
-    raw_data: one of the raw data outputs from rnnlm_raw_data.
-    batch_size: int, the batch size.
-    num_steps: int, the number of unrolls.
-    name: the name of this operation (optional).
+  all_words = collections.Counter()
+  all_word_lists = []
+  for f in files:
+    with open(f, mode="r") as fp:
+      text = fp.read()
 
-  Returns:
-    A pair of Tensors, each shaped [batch_size, num_steps]. The second element
-    of the tuple is the same data time-shifted to the right by one.
+    word_list = re.split("[^A-Za-z]", text)
+    word_list = list(filter(None, word_list))
+    all_words.update(word_list)
+    all_word_lists.append(word_list)
 
-  Raises:
-    tf.errors.InvalidArgumentError: if batch_size or num_steps are too high.
-  """
-  with tf.name_scope(name, "RNNLMProducer", [raw_data, batch_size, num_steps]):
-    raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32)
-
-    data_len = tf.size(raw_data)
-    batch_len = data_len // batch_size
-    data = tf.reshape(raw_data[0 : batch_size * batch_len],
-                      [batch_size, batch_len])
-
-    epoch_size = (batch_len - 1) // num_steps
-    assertion = tf.assert_positive(
-        epoch_size,
-        message="epoch_size == 0, decrease batch_size or num_steps")
-    with tf.control_dependencies([assertion]):
-      epoch_size = tf.identity(epoch_size, name="epoch_size")
-
-    i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
-    x = tf.strided_slice(data, [0, i * num_steps],
-                         [batch_size, (i + 1) * num_steps])
-    x.set_shape([batch_size, num_steps])
-    y = tf.strided_slice(data, [0, i * num_steps + 1],
-                         [batch_size, (i + 1) * num_steps + 1])
-    y.set_shape([batch_size, num_steps])
-    return x, y
+  word_to_id = {word: i for i, (word, _) in enumerate(all_words.most_common())}
+
+  def convert(word_list):
+    return [word_to_id[word] for word in word_list]
+
+  all_word_ids = [convert(word_list) for word_list in all_word_lists]
+  return all_word_ids, word_to_id
+
+
+class RNNLMProducer(tf.Module):
+  """This is the data feeder."""
+
+  def __init__(self, raw_data, batch_size, num_steps, name=None):
+    super().__init__(name)
+    self.batch_size = batch_size
+    self.num_steps = num_steps
+    self.epoch_size = (len(raw_data) - 1) // num_steps // batch_size
+
+    # load data into a variable so that it will be separated from graph
+    self._raw_data = tf.Variable(raw_data, dtype=tf.int32, trainable=False)
+
+    ds_x = tf.data.Dataset.from_tensor_slices(self._raw_data)
+    ds_y = ds_x.skip(1)
+    ds = tf.data.Dataset.zip((ds_x, ds_y))
+    # form samples
+    ds = ds.batch(num_steps, drop_remainder=True)
+    # form batches
+    self._ds = ds.batch(batch_size, drop_remainder=True)
+
+  def iterate(self):
+    return self._ds
+
+
+if __name__ == "__main__":
+  samples = list(range(100))
+  ds = RNNLMProducer(samples, 4, 8)
+  print(ds.epoch_size)
+  for data in ds.iterate():
+    print(data)
diff --git a/src/tfrnnlm/CMakeLists.txt b/src/tfrnnlm/CMakeLists.txt
new file mode 100644
index 00000000000..3b0b9aaa10a
--- /dev/null
+++ b/src/tfrnnlm/CMakeLists.txt
@@ -0,0 +1,55 @@
+# Builds kaldi-tfrnnlm against a TensorFlow source tree that has already been
+# built with bazel; TENSORFLOW_DIR must point at the root of that tree.
+set(PUBLIC_HEADERS
+    tensorflow-rnnlm.h
+)
+
+if(NOT TENSORFLOW_DIR)
+    message(FATAL_ERROR "TENSORFLOW_DIR is not set, please point it at a built TensorFlow source tree.")
+endif()
+
+# Full paths are checked here and linked below, so the libraries that are
+# verified to exist are exactly the ones handed to the linker.
+set(TENSORFLOW_CC_LIB "${TENSORFLOW_DIR}/bazel-bin/tensorflow/libtensorflow_cc.so")
+set(TENSORFLOW_FRAMEWORK_LIB "${TENSORFLOW_DIR}/bazel-bin/tensorflow/libtensorflow_framework.so")
+if(NOT EXISTS "${TENSORFLOW_CC_LIB}" OR NOT EXISTS "${TENSORFLOW_FRAMEWORK_LIB}")
+    message(FATAL_ERROR "TensorFlow components are not built, please build TensorFlow first.")
+endif()
+
+add_library(kaldi-tfrnnlm
+    tensorflow-rnnlm.cc
+)
+
+# -fPIC is a compile option, so request it through the target property
+# instead of listing it among the link libraries.
+set_target_properties(kaldi-tfrnnlm PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+target_include_directories(kaldi-tfrnnlm PUBLIC
+    "${TENSORFLOW_DIR}/bazel-tensorflow/external/com_google_protobuf/src"
+    "${TENSORFLOW_DIR}/bazel-genfiles"
+    "${TENSORFLOW_DIR}"
+    "${TENSORFLOW_DIR}/tensorflow/lite/tools/make/downloads/eigen"
+    "${TENSORFLOW_DIR}/tensorflow/lite/tools/make/downloads/absl"
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/..>
+    $<INSTALL_INTERFACE:include/kaldi>
+)
+
+target_link_libraries(kaldi-tfrnnlm PUBLIC
+    kaldi-lm
+    kaldi-util
+    kaldi-matrix
+    kaldi-base
+    z
+    ${CMAKE_DL_LIBS}
+    "${TENSORFLOW_CC_LIB}"
+    "${TENSORFLOW_FRAMEWORK_LIB}"
+)
+
+install(TARGETS kaldi-tfrnnlm
+    EXPORT kaldi-targets
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+)
+
+install(FILES ${PUBLIC_HEADERS} DESTINATION include/kaldi/tfrnnlm)
diff --git a/src/tfrnnlm/tensorflow-rnnlm.cc b/src/tfrnnlm/tensorflow-rnnlm.cc
index e4de98abd12..2f9268fa790 100644
--- a/src/tfrnnlm/tensorflow-rnnlm.cc
+++ b/src/tfrnnlm/tensorflow-rnnlm.cc
@@ -27,6 +27,8 @@
 
 // Tensorflow includes were moved after tfrnnlm/tensorflow-rnnlm.h include to
 // avoid macro redefinitions. See also the note in tfrnnlm/tensorflow-rnnlm.h.
+#include "tensorflow/cc/saved_model/loader.h"
+#include "tensorflow/cc/saved_model/tag_constants.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
@@ -64,42 +66,75 @@ void SetUnkPenalties(const string &filename,
 // Read tensorflow checkpoint files
 void KaldiTfRnnlmWrapper::ReadTfModel(const std::string &tf_model_path,
                                       int32 num_threads) {
-  string graph_path = tf_model_path + ".meta";
-
   tensorflow::SessionOptions session_options;
+  tensorflow::RunOptions run_options;
 
   session_options.config.set_intra_op_parallelism_threads(num_threads);
   session_options.config.set_inter_op_parallelism_threads(num_threads);
 
-  Status status = tensorflow::NewSession(session_options,
-                                         &session_);
+  Status status = tensorflow::LoadSavedModel(
+      session_options, run_options, tf_model_path,
+      {tensorflow::kSavedModelTagServe},
+      &bundle_);
   if (!status.ok()) {
     KALDI_ERR << status.ToString();
   }
 
-  tensorflow::MetaGraphDef graph_def;
-  status = tensorflow::ReadBinaryProto(tensorflow::Env::Default(), graph_path,
-                                       &graph_def);
-  if (!status.ok()) {
-    KALDI_ERR << status.ToString();
+  // SavedModel maintains a list of exported function signatures in its metadata.
+  // We read that list to look up the actual tensor names.
+  auto&& signature_map = bundle_.meta_graph_def.signature_def();
+  auto signature_it = signature_map.find("single_step");
+  if (signature_it == signature_map.end()) {
+    KALDI_ERR << "Cannot find signature `single_step' in SavedModel.";
   }
 
-  // Add the graph to the session
-  status = session_->Create(graph_def.graph_def());
-  if (!status.ok()) {
-    KALDI_ERR << status.ToString();
+  auto&& signature = signature_it->second;
+
+  const std::vector<std::pair<const char*, std::string&>> input_params = {
+    {"context", context_tensor_name_},
+    {"word_id", word_id_tensor_name_},
+  };
+
+  for (auto&& pair : input_params) {
+    auto&& map = signature.inputs();
+    auto param_it = map.find(pair.first);
+    if (param_it == map.end()) {
+      KALDI_ERR << "Cannot find input param `" << pair.first << "' in signature, abort.";
+    }
+    pair.second = param_it->second.name();
+    // printf("%s: %s\n", pair.first, pair.second.c_str());
   }
 
-  Tensor checkpointPathTensor(tensorflow::DT_STRING, tensorflow::TensorShape());
-  checkpointPathTensor.scalar<std::string>()() = tf_model_path;
+  const std::vector<std::pair<const char*, std::string&>> output_params = {
+    {"log_prob", log_prob_tensor_name_},
+    {"rnn_out", rnn_out_tensor_name_},
+    {"rnn_states", rnn_states_tensor_name_},
+  };
+
+  for (auto&& pair : output_params) {
+    auto&& map = signature.outputs();
+    auto param_it = map.find(pair.first);
+    if (param_it == map.end()) {
+      KALDI_ERR << "Cannot find output param `" << pair.first << "' in signature, abort.";
+    }
+    pair.second = param_it->second.name();
+    // printf("%s: %s\n", pair.first, pair.second.c_str());
+  }
 
-  status = session_->Run(
-      {{graph_def.saver_def().filename_tensor_name(), checkpointPathTensor} },
-      {},
-      {graph_def.saver_def().restore_op_name()},
-      nullptr);
-  if (!status.ok()) {
-    KALDI_ERR << status.ToString();
+  // A second exported function only emits the initial RNN state.
+  signature_it = signature_map.find("get_initial_state");
+  if (signature_it == signature_map.end()) {
+    KALDI_ERR << "Cannot find signature `get_initial_state' in SavedModel.";
+  }
+
+  {
+    auto&& signature = signature_it->second;
+    auto&& map = signature.outputs();
+    auto param_it = map.find("initial_state");
+    if (param_it == map.end()) {
+      KALDI_ERR << "Cannot find output param `initial_state' in signature, abort.";
+    }
+    initial_state_tensor_name_ = param_it->second.name();
   }
 }
 
@@ -177,13 +212,16 @@ KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper(
   delete fst_word_symbols;
 }
 
+KaldiTfRnnlmWrapper::~KaldiTfRnnlmWrapper() {
+}
+
 void KaldiTfRnnlmWrapper::AcquireInitialTensors() {
   Status status;
   // get the initial context; this is basically the all-0 tensor
   {
     std::vector<Tensor> state;
-    status = session_->Run(std::vector<std::pair<string, Tensor> >(),
-                           {"Train/Model/test_initial_state"}, {}, &state);
+    status = bundle_.session->Run(std::vector<std::pair<string, Tensor> >(),
+                           {initial_state_tensor_name_}, {}, &state);
     if (!status.ok()) {
       KALDI_ERR << status.ToString();
     }
@@ -197,11 +235,11 @@ void KaldiTfRnnlmWrapper::AcquireInitialTensors() {
     bosword.scalar<int32>()() = eos_;  // eos_ is more like a sentence boundary
 
     std::vector<std::pair<string, Tensor> > inputs = {
-      {"Train/Model/test_word_in", bosword},
-      {"Train/Model/test_state_in", initial_context_},
+      {word_id_tensor_name_, bosword},
+      {context_tensor_name_, initial_context_},
     };
 
-    status = session_->Run(inputs, {"Train/Model/test_cell_out"}, {}, &state);
+    status = bundle_.session->Run(inputs, {rnn_out_tensor_name_}, {}, &state);
     if (!status.ok()) {
       KALDI_ERR << status.ToString();
     }
@@ -215,27 +253,23 @@ BaseFloat KaldiTfRnnlmWrapper::GetLogProb(int32 word,
                                           const Tensor &cell_in,
                                           Tensor *context_out,
                                           Tensor *new_cell) {
-  std::vector<std::pair<string, Tensor> > inputs;
-
   Tensor thisword(tensorflow::DT_INT32, {1, 1});
-
   thisword.scalar<int32>()() = word;
+
   std::vector<Tensor> outputs;
 
-  if (context_out != NULL) {
-    inputs = {
-      {"Train/Model/test_word_in", thisword},
-      {"Train/Model/test_word_out", thisword},
-      {"Train/Model/test_state_in", context_in},
-      {"Train/Model/test_cell_in", cell_in},
-    };
+  std::vector<std::pair<string, Tensor> > inputs = {
+    {word_id_tensor_name_, thisword},
+    {context_tensor_name_, context_in},
+  };
 
+  if (context_out != NULL) {
     // The session will initialize the outputs
     // Run the session, evaluating our "c" operation from the graph
-    Status status = session_->Run(inputs,
-        {"Train/Model/test_out",
-         "Train/Model/test_state_out",
-         "Train/Model/test_cell_out"}, {}, &outputs);
+    Status status = bundle_.session->Run(inputs,
+        {log_prob_tensor_name_,
+         rnn_out_tensor_name_,
+         rnn_states_tensor_name_}, {}, &outputs);
     if (!status.ok()) {
       KALDI_ERR << status.ToString();
     }
@@ -243,14 +277,9 @@ BaseFloat KaldiTfRnnlmWrapper::GetLogProb(int32 word,
     *context_out = outputs[1];
     *new_cell = outputs[2];
   } else {
-    inputs = {
-      {"Train/Model/test_word_out", thisword},
-      {"Train/Model/test_cell_in", cell_in},
-    };
-
     // Run the session, evaluating our "c" operation from the graph
-    Status status = session_->Run(inputs,
-        {"Train/Model/test_out"}, {}, &outputs);
+    Status status = bundle_.session->Run(inputs,
+        {log_prob_tensor_name_}, {}, &outputs);
     if (!status.ok()) {
       KALDI_ERR << status.ToString();
     }
diff --git a/src/tfrnnlm/tensorflow-rnnlm.h b/src/tfrnnlm/tensorflow-rnnlm.h
index 90b68755964..5b6b46bc64e 100644
--- a/src/tfrnnlm/tensorflow-rnnlm.h
+++ b/src/tfrnnlm/tensorflow-rnnlm.h
@@ -53,7 +53,7 @@
 #undef DCHECK_GE
 #undef DCHECK_NE
 
-#include "tensorflow/core/public/session.h"
+#include "tensorflow/cc/saved_model/loader.h"
 
 using tensorflow::Session;
 using tensorflow::Tensor;
@@ -97,9 +97,7 @@ class KaldiTfRnnlmWrapper {
                       const std::string &word_symbol_table_rxfilename,
                       const std::string &unk_prob_file,
                       const std::string &tf_model_path);
-  ~KaldiTfRnnlmWrapper() {
-    session_->Close();
-  }
+  ~KaldiTfRnnlmWrapper();
 
   int32 GetEos() const { return eos_; }
 
@@ -156,7 +154,14 @@ class KaldiTfRnnlmWrapper {
   // this corresponds to the RNNLM symbol table
   int32 num_rnn_words;
 
-  Session* session_;  // for TF computation; pointer owned here
+  // for TF computation
+  tensorflow::SavedModelBundle bundle_;
+  std::string word_id_tensor_name_;
+  std::string context_tensor_name_;
+  std::string log_prob_tensor_name_;
+  std::string rnn_out_tensor_name_;
+  std::string rnn_states_tensor_name_;
+  std::string initial_state_tensor_name_;
   int32 eos_;
   int32 oos_;
 
diff --git a/src/tfrnnlmbin/CMakeLists.txt b/src/tfrnnlmbin/CMakeLists.txt
new file mode 100644
index 00000000000..bea32fbac02
--- /dev/null
+++ b/src/tfrnnlmbin/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_kaldi_executable(NAME lattice-lmrescore-tf-rnnlm SOURCES lattice-lmrescore-tf-rnnlm.cc DEPENDS kaldi-tfrnnlm kaldi-lat)
+add_kaldi_executable(NAME lattice-lmrescore-tf-rnnlm-pruned SOURCES lattice-lmrescore-tf-rnnlm-pruned.cc DEPENDS kaldi-tfrnnlm kaldi-lat)